[ofa-general] [PATCH] mlx4: remove limitation on LSO header size
Current code has a limitation as for the size of an LSO header not allowed to cross a 64 byte boundary. This patch removes this limitation by setting the WQE RR for large headers thus allowing LSO headers of any size. The extra buffer reserved for MLX4_IB_QP_LSO QPs has been doubled, from 64 to 128 bytes, assuming this is reasonable upper limit to header length. Also, this patch will cause IB_DEVICE_UD_TSO to be set only of FW versions that set MLX4_DEV_CAP_FLAG_BLH; e.g. FW version 2.6.000 and higher. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c |2 +- drivers/infiniband/hw/mlx4/qp.c | 17 +++-- include/linux/mlx4/device.h |1 + 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3cb3f47..e596537 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -103,7 +103,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props-device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev-dev-caps.flags MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props-device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - if (dev-dev-caps.max_gso_sz) + if (dev-dev-caps.max_gso_sz dev-dev-caps.flags MLX4_DEV_CAP_FLAG_BLH) props-device_cap_flags |= IB_DEVICE_UD_TSO; if (dev-dev-caps.bmme_flags MLX4_BMME_FLAG_RESERVED_LKEY) props-device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 219b103..1b356cf 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -261,7 +261,7 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) case IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags MLX4_IB_QP_LSO) ? 64 : 0); + ((flags MLX4_IB_QP_LSO) ? 
128 : 0); case IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); @@ -1467,16 +1467,11 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, -__be32 *lso_hdr_sz) +__be32 *lso_hdr_sz, int *blh) { unsigned halign = ALIGN(sizeof *wqe + wr-wr.ud.hlen, 16); - /* -* This is a temporary limitation and will be removed in -* a forthcoming FW release: -*/ - if (unlikely(halign 64)) - return -EINVAL; + *blh = unlikely(halign 64) ? 1 : 0; if (unlikely(!(qp-flags MLX4_IB_QP_LSO) wr-num_sge qp-sq.max_gs - (halign 4))) @@ -1523,6 +1518,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); int i; + int blh = 0; spin_lock_irqsave(qp-sq.lock, flags); @@ -1616,7 +1612,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr-opcode == IB_WR_LSO) { - err = build_lso_seg(wqe, wr, qp, seglen, lso_hdr_sz); + err = build_lso_seg(wqe, wr, qp, seglen, lso_hdr_sz, blh); if (unlikely(err)) { *bad_wr = wr; goto out; @@ -1687,7 +1683,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl-owner_opcode = mlx4_ib_opcode[wr-opcode] | - (ind qp-sq.wqe_cnt ? cpu_to_be32(1 31) : 0); + (ind qp-sq.wqe_cnt ? cpu_to_be32(1 31) : 0) | + (blh ? 
cpu_to_be32(1 6) : 0); stamp = ind + qp-sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U qp-sq.wqe_shift); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ce7cc6c..e92d1bf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -61,6 +61,7 @@ enum { MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 8, MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 9, MLX4_DEV_CAP_FLAG_DPDP = 1 12, + MLX4_DEV_CAP_FLAG_BLH = 1 15, MLX4_DEV_CAP_FLAG_MEM_WINDOW= 1 16, MLX4_DEV_CAP_FLAG_APM = 1 17, MLX4_DEV_CAP_FLAG_ATOMIC= 1 18, -- 1.6.4.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
[ofa-general] [PATCH v2] mlx4: configure cache line size
ConnectX can work more efficiently if the CPU cache line size is configured to it at INIT_HCA. This patch configures the CPU cache line size. Signed-off-by: Eli Cohen e...@mellanox.co.il --- As per Roland's comments, the following changes were made: 1. Remove #ifdef cache_line_size and include linux/cache.h 2. Assume cache line size is a power of 2 and use ilog2 instead of order_base_2 drivers/net/mlx4/fw.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index cee199c..3c16602 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -33,6 +33,7 @@ */ #include linux/mlx4/cmd.h +#include linux/cache.h #include fw.h #include icm.h @@ -698,6 +699,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_IN_SIZE0x200 #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION2 +#define INIT_HCA_CACHELINE_SZ_OFFSET0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET(INIT_HCA_QPC_OFFSET + 0x10) @@ -735,6 +737,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) *((u8 *) mailbox-buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; + *((u8 *) mailbox-buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + (ilog2(cache_line_size()) - 4) 5; + #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) = ~cpu_to_be32(1 1); #elif defined(__BIG_ENDIAN) -- 1.6.4.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Possible process deadlock in RMPP flow
Hi Sean, one of our customers experiences problems when running ibnetdiscover. The problem happens from time to time. Here is the call stack the he gets: ibnetdiscover D 80149b8d 0 26968 26544 (L-TLB) 8102c900bd88 0046 81037e8e 81037e8e02e8 8102c900bd78 000a 8102c5b50820 81038a929820 011837bf6105 0ede 8102c5b50a08 0001 Call Trace: [80064207] wait_for_completion+0x79/0xa2 [8008b4cc] default_wake_function+0x0/0xe [882271d9] :ib_mad:ib_cancel_rmpp_recvs+0x87/0xde [88224485] :ib_mad:ib_unregister_mad_agent+0x30d/0x424 [883983e9] :ib_umad:ib_umad_close+0x9d/0xd6 [80012e22] __fput+0xae/0x198 [80023de6] filp_close+0x5c/0x64 [800393df] put_files_struct+0x63/0xae [80015b26] do_exit+0x31c/0x911 [8004971a] cpuset_exit+0x0/0x6c [8005e116] system_call+0x7e/0x83 From the dump it seems that the process is waits on the call to flush_workqueue() in ib_cancel_rmpp_recvs(). The package they use is OFED 1.4.2. Do you have any idea or suggestions how to sort this out? Thanks. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: Possible process deadlock in RMPP flow
On Wed, Sep 23, 2009 at 09:08:28AM -0700, Sean Hefty wrote: Roland just submitted a patch in this area yesterday. I don't know if the patch would fix their issue, but it may be worth trying. What kernel does 1.4.2 map to? I think OFED 1.4.2 is based on kernel 2.6.27 but they're using RHEL 5.3. Thanks, we'll try this. What RMPP messages does ibnetdiscover use? If the program is completing successfully, there may be a different race with the rmpp cleanup. I'll see if anything else stands out in that area. - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH] mlx4: configure cache line size
On Tue, Sep 22, 2009 at 10:51:10AM -0700, Roland Dreier wrote: I agree with you on both comments. Would you like me to resend or will you make the necessary changes? +#if defined(cache_line_size) Why the #if here? Do we just need to include linux/cache.h explicitly to make sure we get the define? + *((u8 *) mailbox-buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + order_base_2(cache_line_size() / 16) 5; Trivial but I think it's safe to assume a cacheline is always a power of 2. And I think it's clearer (and avoids generating a divide) to use subtraction rather than division... so this could all become: (ilog2(cache_line_size()) - 4) 5; - R. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] mlx4: configure cache line size
ConnectX can work more efficiently if the CPU cache line size is confiugred to it at INIT_HCA. This patch configures cache line size for systems that report it. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/net/mlx4/fw.c |7 +++ 1 files changed, 7 insertions(+), 0 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 20526ce..aa38c06 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -699,6 +699,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_IN_SIZE0x200 #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION2 +#define INIT_HCA_CACHELINE_SZ_OFFSET0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET(INIT_HCA_QPC_OFFSET + 0x10) @@ -736,6 +737,12 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) *((u8 *) mailbox-buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; +#if defined(cache_line_size) + *((u8 *) mailbox-buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + order_base_2(cache_line_size() / 16) 5; +#endif + + #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) = ~cpu_to_be32(1 1); #elif defined(__BIG_ENDIAN) -- 1.6.4.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Better way to get sufficient EQ context memory?
On Thu, Aug 20, 2009 at 02:33:41PM -0700, Roland Dreier wrote: Eli, it occurs to me that since we're doing more than one page for EQ context now, we might as well use the normal ICM table stuff that everything else uses. Seems the code becomes much simpler and I don't think there's any real overhead added... thoughts? Yes it look cleaner, and it works well on my 4 core system. Let's wait for Christoph's approval on his system. (Christoph, I tested this with possible_cpus=32 and it still works for me -- if you get a chance on your Dell systems that would be helpful too) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv5 0/10] RDMAoE support
Roland, what about this series of patches? Would you like me to re-create them over your xrc branch or would you rather take them before xrc? On Wed, Aug 19, 2009 at 08:19:35PM +0300, Eli Cohen wrote: RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. RDMAoE encodes IP addresses into its GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs, standard IP to MAC mappings apply. To support RDMAoE, a new transport protocol was added to the IB core. An RDMA device can have ports with different transports, which are identified by a port transport attribute. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs while LID fields are ignored. The Ethernet L2 information is subsequently obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. As there is no SA in RDMAoE, the CMA code is modified to fill the necessary path record attributes locally before sending CM packets. Similarly, the CMA provides to the user the required address handle attributes when processing SIDR requests and joining multicast groups. In this patch set, an RDMAoE port is currently assigned a single GID, encoding the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code temporarily uses IPv6 link-local addresses as GIDs instead of the IP address provided by the user, thereby supporting any IP address. 
To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. Following is a series of 10 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches, and is tagged v5. Changes from v4: 1. Added rdma_is_transport_supported() and used it to simplify conditionals throughout the code. 2. ib_register_mad_agent()for QP0 is only called for IB ports 3. PATCH 5/10 changed from Enable support for RDMAoE ports to Enable support only for IB ports. 4. MAD services from userspace currently not supported for RDMAoE ports. 5. Add kref to struct cma_multicast to aid in maintaining reference count on the object. This is to avoid freeing the object while the worker thread is still using it. 6. Return immediate error for invalid MTU when resolving an RDMAoE path 7. Don't fail resolve path if rate is 0 since this value stands for IB_RATE_PORT_CURRENT. 8. In cma_rdmaoe_join_multicast(), fail immediately if mtu is zero. 9. Add ucma_copy_rdmaoe_route()instead of modifying ucma_copy_ib_route(). 10. Bug fix: in PATCH 10/10, call flush_workqueue after unregistering netdev notifiers 11. Multicast no longer use the broadcast MAC. 12. No changes to patches 2, 7 and 8 from the v4 series. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- b/drivers/infiniband/core/agent.c | 38 ++- b/drivers/infiniband/core/cm.c | 25 +- b/drivers/infiniband/core/cma.c | 54 ++-- b/drivers/infiniband/core/mad.c | 41 ++- b/drivers/infiniband/core/multicast.c |4 b/drivers/infiniband/core/sa_query.c| 39 ++- b/drivers/infiniband/core/ucm.c |8 b/drivers/infiniband/core/ucma.c|2 b/drivers/infiniband/core/ud_header.c | 111 ++ b/drivers/infiniband/core/user_mad.c|6 b/drivers/infiniband/core/uverbs.h |1 b/drivers/infiniband/core/uverbs_cmd.c | 32 ++ b/drivers/infiniband/core/uverbs_main.c |1 b/drivers/infiniband/core/verbs.c | 25 ++ b/drivers/infiniband/hw/mlx4/ah.c | 187 +--- b/drivers/infiniband/hw/mlx4/mad.c | 32 +- b/drivers/infiniband/hw/mlx4/main.c | 309 +--- b/drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 + b/drivers/infiniband/hw/mlx4/qp.c | 172 ++- b/drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 - b/drivers/net/mlx4/en_main.c| 15 + b/drivers/net/mlx4/en_port.c|4 b/drivers/net/mlx4/en_port.h|3 b/drivers/net/mlx4/fw.c |3 b/drivers/net/mlx4/intf.c
[ofa-general] [PATCHv5 01/10] ib_core: Refine device personality from node type to port type
As a preparation to devices that, in general, support a different transport protocol for each port, specifically RDMAoE, this patch defines a transport type for each of a device's ports. As a result, rdma_node_get_transport() has been unexported and is used internally by the implementation of the new API, rdma_port_get_transport(), which gives the transport protocol of the queried port. rdma_is_transport_supported() is also added to be used for verifying if a given device supports a given protocol on any of its ports. All references to rdma_node_get_transport() are changed to to use the new APIs. Also, ib_port_attr is extended to contain enum rdma_transport_type. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: Define and make use of rdma_is_transport_supported(), an API that allows the caller to check if a given device supports a given transport protocol on any of its ports. drivers/infiniband/core/cm.c | 25 + drivers/infiniband/core/cma.c | 54 +++-- drivers/infiniband/core/mad.c | 41 ++ drivers/infiniband/core/multicast.c |4 +- drivers/infiniband/core/sa_query.c| 39 + drivers/infiniband/core/ucm.c |8 +++- drivers/infiniband/core/ucma.c|2 +- drivers/infiniband/core/user_mad.c|6 +++- drivers/infiniband/core/verbs.c | 25 - drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 +++--- include/rdma/ib_verbs.h | 11 -- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |3 +- net/sunrpc/xprtrdma/svc_rdma_transport.c |2 +- 13 files changed, 148 insertions(+), 84 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 5130fc5..d082f59 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3678,8 +3678,9 @@ static void cm_add_one(struct ib_device *ib_device) unsigned long flags; int ret; u8 i; + enum rdma_transport_type tt; - if (rdma_node_get_transport(ib_device-node_type) != RDMA_TRANSPORT_IB) + if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB)) return; cm_dev = kzalloc(sizeof(*cm_dev) 
+ sizeof(*port) * @@ -3700,6 +3701,10 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { + tt = rdma_port_get_transport(ib_device, i); + if (tt != RDMA_TRANSPORT_IB) + continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto error1; @@ -3742,9 +3747,11 @@ error1: port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); @@ -3770,10 +3777,12 @@ static void cm_remove_one(struct ib_device *ib_device) for (i = 1; i = ib_device-phys_port_cnt; i++) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - flush_workqueue(cm.wq); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + flush_workqueue(cm.wq); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..02fd045 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -329,24 +329,26 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev; union ib_gid gid; int ret = -ENODEV; - - switch (rdma_node_get_transport(dev_addr-dev_type)) { - case RDMA_TRANSPORT_IB: - ib_addr_get_sgid(dev_addr, gid); - break; - case RDMA_TRANSPORT_IWARP: - iw_addr_get_sgid(dev_addr, gid); - break; - default: - return -ENODEV; - } + int port; list_for_each_entry(cma_dev, dev_list, list) { - ret
[ofa-general] [PATCHv5 03/10] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: 1. Instead of returning NULL for unsupported ports (which is no considered an error), now callers of ib_register_mad_agent() must verify that the port/special QP is supported before calling this function. 2. Make use of rdma_is_transport_supported() where appropriate. drivers/infiniband/core/agent.c | 38 +- drivers/infiniband/core/mad.c | 37 + 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..c130a4a 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device*device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -146,6 +147,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) struct ib_agent_port_private *port_priv; unsigned long flags; int ret; + enum rdma_transport_type tt; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); @@ -155,14 +157,17 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error1; } - /* Obtain send only MAD agent for SMI QP */ - port_priv-agent[0] = ib_register_mad_agent(device, port_num, - IB_QPT_SMI, NULL, 0, - agent_send_handler, - NULL, 
NULL); - if (IS_ERR(port_priv-agent[0])) { - ret = PTR_ERR(port_priv-agent[0]); - goto error2; + tt = rdma_port_get_transport(device, port_num); + if (tt == RDMA_TRANSPORT_IB) { + /* Obtain send only MAD agent for SMI QP */ + port_priv-agent[0] = ib_register_mad_agent(device, port_num, + IB_QPT_SMI, NULL, 0, + agent_send_handler, + NULL, NULL); + if (IS_ERR(port_priv-agent[0])) { + ret = PTR_ERR(port_priv-agent[0]); + goto error2; + } } /* Obtain send only MAD agent for GSI QP */ @@ -175,6 +180,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); @@ -182,7 +190,8 @@ int ib_agent_port_open(struct ib_device *device, int port_num) return 0; error3: - ib_unregister_mad_agent(port_priv-agent[0]); + if (tt == RDMA_TRANSPORT_IB) + ib_unregister_mad_agent(port_priv-agent[0]); error2: kfree(port_priv); error1: @@ -194,6 +203,9 @@ int ib_agent_port_close(struct ib_device *device, int port_num) struct ib_agent_port_private *port_priv; unsigned long flags; + if (rdma_port_get_transport(device, port_num) != RDMA_TRANSPORT_IB) + return 0; + spin_lock_irqsave(ib_agent_port_list_lock, flags); port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c06117c..aceae79 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2602,6 +2602,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2646,9 @@ static int
[ofa-general] [PATCHv5 04/10] IB/umad: Enable support only for IB ports
Initialize umad context for devices that have any of their ports IB. Since devices may have ports of two different protocols (for example, RDMA_TRANSPORT_IB and RDMA_TRANSPORT_RDMAOE), ib_umad_add_one() needs to succeed if any of the ports is IB but ib_umad_init_port() is called only for IB ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from last version: 1. Patch title changed from to Enable support for RDMAoE ports to Enable support only for IB ports. 2. Do not allow userspace MADs to RDMAoE ports. drivers/infiniband/core/user_mad.c | 15 +++ 1 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index aa4eeb3..51888eb 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1123,10 +1123,6 @@ static void ib_umad_add_one(struct ib_device *device) e = device-phys_port_cnt; } - for (i = s; i = e; ++i) - if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB) - return; - umad_dev = kzalloc(sizeof *umad_dev + (e - s + 1) * sizeof (struct ib_umad_port), GFP_KERNEL); @@ -1141,8 +1137,9 @@ static void ib_umad_add_one(struct ib_device *device) for (i = s; i = e; ++i) { umad_dev-port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, umad_dev-port[i - s])) - goto err; + if (rdma_port_get_transport(device, i) == RDMA_TRANSPORT_IB) + if (ib_umad_init_port(device, i, umad_dev-port[i - s])) + goto err; } ib_set_client_data(device, umad_client, umad_dev); @@ -1151,7 +1148,8 @@ static void ib_umad_add_one(struct ib_device *device) err: while (--i = s) - ib_umad_kill_port(umad_dev-port[i - s]); + if (rdma_port_get_transport(device, i) == RDMA_TRANSPORT_IB) + ib_umad_kill_port(umad_dev-port[i - s]); kref_put(umad_dev-ref, ib_umad_release_dev); } @@ -1165,7 +1163,8 @@ static void ib_umad_remove_one(struct ib_device *device) return; for (i = 0; i = umad_dev-end_port - umad_dev-start_port; ++i) - ib_umad_kill_port(umad_dev-port[i]); + if 
(rdma_port_get_transport(device, i + 1) == RDMA_TRANSPORT_IB) + ib_umad_kill_port(umad_dev-port[i]); kref_put(umad_dev-ref, ib_umad_release_dev); } -- 1.6.4 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv5 05/10] ib/cm: Enable CM support for RDMAoE
CM messages can be transported on RDMAoE protocol ports so they are enabled here. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from last version: Make use of rdma_is_transport_supported() drivers/infiniband/core/cm.c |5 +++-- drivers/infiniband/core/ucm.c | 12 +--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d082f59..c9f9122 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3680,7 +3680,8 @@ static void cm_add_one(struct ib_device *ib_device) u8 i; enum rdma_transport_type tt; - if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB)) + if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB) + !rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_RDMAOE)) return; cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * @@ -3702,7 +3703,7 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { tt = rdma_port_get_transport(ib_device, i); - if (tt != RDMA_TRANSPORT_IB) + if (tt != RDMA_TRANSPORT_IB tt != RDMA_TRANSPORT_RDMAOE) continue; port = kzalloc(sizeof *port, GFP_KERNEL); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index b508020..3ce5df2 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1240,13 +1240,19 @@ static void ib_ucm_add_one(struct ib_device *device) { struct ib_ucm_device *ucm_dev; int i; + enum rdma_transport_type tt; if (!device-alloc_ucontext) return; - for (i = 1; i = device-phys_port_cnt; ++i) - if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB) - return; + for (i = 1; i = device-phys_port_cnt; ++i) { + tt = rdma_port_get_transport(device, i); + if (tt == RDMA_TRANSPORT_IB || tt == RDMA_TRANSPORT_RDMAOE) + break; + } + + if (i device-phys_port_cnt) + return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); if (!ucm_dev) -- 1.6.4 ___ general 
mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv5 06/10] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Path resolving and multicast joining are implemented within cma.c by filling the responses and pushing the callbacks to the cma work queue. IP-GID resolution always yields IPv6 link local addresses - remote GIDs are derived from the destination MAC address of the remote port. Multicast GIDs are always mapped to multicast MACs as is done in IPv6; addtion/removal of addresses are made by calling dev_mc_add/delete thus causing the netedvice driver to update the corresponding port's configuration. IPv4 multlicast is not supported currently. Some helper functions are added to ib_addr.h. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from last version: 1. Add kref to struct cma_multicast to aid in maintaining reference count on the object. This is to avoid freeing the object while the worker thread is still using it. 2. return an immediate error if we get an invalid mtu in a resolved path 3. Don't fail resolve path if rate is 0 since this value stands for IB_RATE_PORT_CURRENT. 4. In cma_rdmaoe_join_multicast(), fail immediately if mtu is zero. 5. Add ucma_copy_rdmaoe_route() to copy route to userspace instead of modifying ucma_copy_ib_route(). 
drivers/infiniband/core/cma.c | 207 ++- drivers/infiniband/core/ucma.c | 31 ++ include/rdma/ib_addr.h | 92 ++ 3 files changed, 324 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 02fd045..6e56e27 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -58,6 +58,7 @@ MODULE_LICENSE(Dual BSD/GPL); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define RDMAOE_PACKET_LIFETIME 18 static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -157,6 +158,7 @@ struct cma_multicast { struct list_headlist; void*context; struct sockaddr_storage addr; + struct kref mcref; }; struct cma_work { @@ -173,6 +175,12 @@ struct cma_ndev_work { struct rdma_cm_eventevent; }; +struct rdmaoe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast*mc; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -290,6 +298,20 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(cma_dev-comp); } +static inline void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + struct rdma_dev_addr *dev_addr = mc-id_priv-id.route.addr.dev_addr; + u8 mac[6]; + + rdma_get_mcast_mac((struct in6_addr *)(mc-multicast.ib-rec.mgid), mac); + rtnl_lock(); + dev_mc_delete(dev_addr-src_dev, mac, 6, 0); + rtnl_unlock(); + kfree(mc-multicast.ib); + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(id_priv-list); @@ -340,6 +362,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); break; + case RDMA_TRANSPORT_RDMAOE: + rdmaoe_addr_get_sgid(dev_addr, gid); + break; default: return -ENODEV; } @@ -568,10 +593,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct 
rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; int ret; + u16 pkey; + + if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) == + RDMA_TRANSPORT_IB) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0x; ret = ib_find_cached_pkey(id_priv-id.device, id_priv-id.port_num, - ib_addr_get_pkey(dev_addr), - qp_attr-pkey_index); + pkey, qp_attr-pkey_index); if (ret) return ret; @@ -601,6 +632,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: if (!id_priv-cm_id.ib || cma_is_ud_ps(id_priv-id.ps)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else @@ -828,8 +860,17 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc
[ofa-general] [PATCHv5 09/10] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or a multicast GID and otherwise -EINVAL is returned. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: Call ib_register_mad_agent() for RDMA_TRANSPORT_IB type ports. drivers/infiniband/hw/mlx4/ah.c | 187 -- drivers/infiniband/hw/mlx4/mad.c | 32 -- drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 +++- drivers/infiniband/hw/mlx4/qp.c | 172 +-- drivers/net/mlx4/fw.c|3 +- include/linux/mlx4/device.h | 31 ++- include/linux/mlx4/qp.h |8 +- 7 files changed, 347 insertions(+), 105 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..0a015c3 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,166 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 s6 = {0}; + struct net_device *netdev; + int ifidx; - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); - memset(ah-av, 0, sizeof ah-av); + memcpy(s6.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + s6.sin6_family = AF_INET6; + s6.sin6_scope_id = ifidx; + if 
(rdma_link_local_addr(s6.sin6_addr)) + rdma_get_ll_mac(s6.sin6_addr, mac); + else if (rdma_is_multicast_addr(s6.sin6_addr)) { + rdma_get_mcast_mac(s6.sin6_addr, mac); + *is_mcast = 1; + } else + return -EINVAL; - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - --ah-av.stat_rate; - } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + return 0; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); + } + + ah-av.ib.dlid= cpu_to_be16(ah_attr-dlid); + if (ah_attr-static_rate) { + ah-av.ib.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; + while (ah-av.ib.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET + !(1 ah-av.ib.stat_rate dev-caps.stat_rate_support)) + --ah-av.ib.stat_rate; } + ah-av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); return ah-ibah; } +static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd-device); + struct 
mlx4_dev *dev = ibdev
[ofa-general] [PATCHv5 0/10] RDMAoE support
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. RDMAoE encodes IP addresses into its GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs, standard IP to MAC mappings apply. To support RDMAoE, a new transport protocol was added to the IB core. An RDMA device can have ports with different transports, which are identified by a port transport attribute. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs while LID fields are ignored. The Ethernet L2 information is subsequently obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. As there is no SA in RDMAoE, the CMA code is modified to fill the necessary path record attributes locally before sending CM packets. Similarly, the CMA provides to the user the required address handle attributes when processing SIDR requests and joining multicast groups. In this patch set, an RDMAoE port is currently assigned a single GID, encoding the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code temporarily uses IPv6 link-local addresses as GIDs instead of the IP address provided by the user, thereby supporting any IP address. To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. 
We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. Following is a series of 10 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches, and is tagged v5. Changes from v4: 1. Added rdma_is_transport_supported() and used it to simplify conditionals throughout the code. 2. ib_register_mad_agent() for QP0 is only called for IB ports 3. PATCH 5/10 changed from Enable support for RDMAoE ports to Enable support only for IB ports. 4. MAD services from userspace currently not supported for RDMAoE ports. 5. Add kref to struct cma_multicast to aid in maintaining reference count on the object. This is to avoid freeing the object while the worker thread is still using it. 6. Return immediate error for invalid MTU when resolving an RDMAoE path 7. Don't fail resolve path if rate is 0 since this value stands for IB_RATE_PORT_CURRENT. 8. In cma_rdmaoe_join_multicast(), fail immediately if mtu is zero. 9. Add ucma_copy_rdmaoe_route() instead of modifying ucma_copy_ib_route(). 10. Bug fix: in PATCH 10/10, call flush_workqueue after unregistering netdev notifiers 11. Multicast no longer uses the broadcast MAC. 12. No changes to patches 2, 7 and 8 from the v4 series. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- b/drivers/infiniband/core/agent.c | 38 ++- b/drivers/infiniband/core/cm.c | 25 +- b/drivers/infiniband/core/cma.c | 54 ++-- b/drivers/infiniband/core/mad.c | 41 ++- b/drivers/infiniband/core/multicast.c |4 b/drivers/infiniband/core/sa_query.c| 39 ++- b/drivers/infiniband/core/ucm.c |8 b/drivers/infiniband/core/ucma.c|2 b/drivers/infiniband/core/ud_header.c | 111 ++ b/drivers/infiniband/core/user_mad.c|6 b/drivers/infiniband/core/uverbs.h |1 b/drivers/infiniband/core/uverbs_cmd.c | 32 ++ b/drivers/infiniband/core/uverbs_main.c |1 b/drivers/infiniband/core/verbs.c | 25 ++ b/drivers/infiniband/hw/mlx4/ah.c | 187 +--- b/drivers/infiniband/hw/mlx4/mad.c | 32 +- b/drivers/infiniband/hw/mlx4/main.c | 309 +--- b/drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 + b/drivers/infiniband/hw/mlx4/qp.c | 172 ++- b/drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 - b/drivers/net/mlx4/en_main.c| 15 + b/drivers/net/mlx4/en_port.c|4 b/drivers/net/mlx4/en_port.h|3 b/drivers/net/mlx4/fw.c |3 b/drivers/net/mlx4/intf.c | 20 + b/drivers/net/mlx4/main.c |6 b/drivers/net/mlx4/mlx4.h |1 b/include/linux/mlx4/cmd.h |1 b/include/linux/mlx4/device.h | 31 ++ b/include/linux/mlx4/driver.h | 16 + b/include/linux
Re: [ofa-general] mlx4: device driver tries to sync DMA memory it has not allocated
On Mon, Aug 10, 2009 at 01:30:44PM -0700, Roland Dreier wrote: Looking at mlx4_write_mtt_chunk() I see that it calls mlx4_table_find() with a pointer to single dma_addr_t - dma_handle - while the dma addresses for the ICM memory is actually a list of different addresses covering possibly different sizes. I think mlx4_table_find() should be changed to support that, and then we can use calls to dma_sync_single_for_cpu()/dma_sync_single_for_device() with the correct dma addresses. No, I think we're careful that we write MTT ranges that don't cross a page so there shouldn't be any problem. But a contiguous ICM memory does not map, in general, to a contiguous DMA memory, so if dma_sync_single_for_*() does not harm anything then it does not do anything useful either. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] mlx4: device driver tries to sync DMA memory it has not allocated
On Tue, Aug 11, 2009 at 11:23:48PM -0700, Roland Dreier wrote: Maybe I'm missing your point, but mlx4_table_find() does go to some trouble to find the right DMA address for the object being looked up. Of course it could be buggy but I still don't see why we would need a list of DMA addresses when we know we are only going to sync part of one page? Is this always true? What if you allocated an ICM buffer that uses non-adjacent pages? In this case you would need more than one call to dma_sync_single_for_*(), isn't it? I think the right thing to do to fix this is to switch to using map_single instead of map_sg, and then use dma_sync_single_range_for_xxx to sync the subset we care about. - R. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 06/10] ib_core: CMA device binding
On Mon, Aug 10, 2009 at 12:31:19PM -0700, Sean Hefty wrote: @@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; int ret; +u16 pkey; + +if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) == nit: It looks like the if is indented by spaces, instead of a tab. Will fix, thanks. +static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv) +{ +work-old_state = CMA_ROUTE_QUERY; +work-new_state = CMA_ROUTE_RESOLVED; +if (!route-path_rec-mtu || !route-path_rec-rate) { +work-event.event = RDMA_CM_EVENT_ROUTE_ERROR; +work-event.status = -1; Any reason not to fail immediately here and leave the id state unchanged? No real reason. Immediate failure is just as good as a deffered one. I will change that and also remove change the rule to ommit !route-path_rec-rate. +} else { +work-event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +work-event.status = 0; +} + +queue_work(cma_wq, work-work); + +kfree(mw); +} + +static int cma_rdmaoe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ +struct rdmaoe_mcast_work *work; +struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; + +if (cma_zero_addr((struct sockaddr *)mc-addr)) +return -EINVAL; + +work = kzalloc(sizeof *work, GFP_KERNEL); +if (!work) +return -ENOMEM; + +mc-multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); +if (!mc-multicast.ib) { +kfree(work); +return -ENOMEM; +} nit: I'd prefer to goto a common cleanup area to make it easier to add changes in the future. Will change that. 
+ +cma_set_mgid(id_priv, (struct sockaddr *)mc-addr, mc-multicast.ib- rec.mgid); +mc-multicast.ib-rec.pkey = cpu_to_be16(0x); +if (id_priv-id.ps == RDMA_PS_UDP) +mc-multicast.ib-rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); +mc-multicast.ib-rec.rate = rdmaoe_get_rate(dev_addr-src_dev); +mc-multicast.ib-rec.hop_limit = 1; +mc-multicast.ib-rec.mtu = rdmaoe_get_mtu(dev_addr-src_dev-mtu); Do we need to check the rate/mtu here, like in resolve route? Or should we be good since we could successfully resolve the route? Actually, can we just read the data from the path record that gets stored with the id? I believe that querying the mtu again to get an up to date vlaue will be better, plus adding a check for the mtu to be none zero or returning immediately with -EINVAL +rdmaoe_addr_get_sgid(dev_addr, mc-multicast.ib-rec.port_gid); +work-id = id_priv; +work-mc = mc; +INIT_WORK(work-work, rdmaoe_mcast_work_handler); + +queue_work(cma_wq, work-work); + +return 0; +} + int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { @@ -2782,6 +2918,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, case RDMA_TRANSPORT_IB: ret = cma_join_ib_multicast(id_priv, mc); break; +case RDMA_TRANSPORT_RDMAOE: +ret = cma_rdmaoe_join_multicast(id_priv, mc); +break; default: ret = -ENOSYS; break; @@ -2793,6 +2932,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, spin_unlock_irq(id_priv-lock); kfree(mc); } + return ret; } EXPORT_SYMBOL(rdma_join_multicast); port_num)) { +tt = rdma_port_get_transport(ctx-cm_id-device, ctx-cm_id-port_num); +switch (tt) { case RDMA_TRANSPORT_IB: -ucma_copy_ib_route(resp, ctx-cm_id-route); +case RDMA_TRANSPORT_RDMAOE: +ucma_copy_ib_route(resp, ctx-cm_id-route, tt); It seems simpler to just add a new call ucma_copy_rdmaoe_route, rather than merging those two transports into a single copy function that then branches based on the transport. Agree, will change. 
break; default: break; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 483057b..66a848e 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -39,6 +39,8 @@ #include linux/netdevice.h #include linux/socket.h #include rdma/ib_verbs.h +#include linux/ethtool.h +#include rdma/ib_pack.h struct rdma_addr_client { atomic_t refcount; @@ -157,4 +159,89 @@ static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr, memcpy(gid, dev_addr-dst_dev_addr, sizeof *gid); } + +static inline int rdma_link_local_addr(struct in6_addr *addr) +{ +if (addr-s6_addr32[0] == cpu_to_be32(0xfe80) +addr-s6_addr32[1] == 0) +return 1; +else +return 0; +} just replace the 'if'
Re: [ofa-general] mlx4: device driver tries to sync DMA memory it has not allocated
Looking at mlx4_write_mtt_chunk() I see that it calls mlx4_table_find() with a pointer to single dma_addr_t - dma_handle - while the dma addresses for the ICM memory is actually a list of different addresses covering possibly different sizes. I think mlx4_table_find() should be changed to support that, and then we can use calls to dma_sync_single_for_cpu()/dma_sync_single_for_device() with the correct dma addresses. Roland, what do you think? On Sat, Aug 08, 2009 at 07:49:22PM +0200, Bart Van Assche wrote: Hello, Has anyone ever encountered a message like the one below ? This message was generated while booting a 2.6.30.4 kernel with CONFIG_DMA_API_DEBUG=y and before any out-of-tree kernel modules were loaded. [ cut here ] WARNING: at lib/dma-debug.c:635 check_sync+0x47c/0x4b0() Hardware name: P5Q DELUXE mlx4_core :01:00.0: DMA-API: device driver tries to sync DMA memory it has not allocated [device address=0x000139482000] [size=4096 bytes] Modules linked in: snd_hda_codec_atihdmi snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_timer snd rtc_cmos soundcore i2c_i801 rtc_core hid_belkin mlx4_core( +) rtc_lib sr_mod sg snd_page_alloc pcspkr button intel_agp i2c_core joydev serio_raw cdrom usbhid hid raid456 raid6_pq async_xor async_memcpy async_tx xor raid0 sd_mod crc_t10dif ehci_hcd uhci_hcd usbcore edd raid1 ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix pata_marvell ahci libata scsi_mod thermal processor thermal_sys hwmon Pid: 1325, comm: work_for_cpu Not tainted 2.6.30.4-scst-debug #6 Call Trace: [8039bc7c] ? check_sync+0x47c/0x4b0 [80248b48] warn_slowpath_common+0x78/0xd0 [80248bfc] warn_slowpath_fmt+0x3c/0x40 [80517769] ? _spin_lock_irqsave+0x49/0x60 [8039b8ab] ? check_sync+0xab/0x4b0 [8039bc7c] check_sync+0x47c/0x4b0 [802724ac] ? mark_held_locks+0x6c/0x90 [8039be1d] debug_dma_sync_single_for_cpu+0x1d/0x20 [a024a969] mlx4_write_mtt+0x159/0x1e0 [mlx4_core] [a0243c02] mlx4_create_eq+0x222/0x650 [mlx4_core] [8027281d] ? 
trace_hardirqs_on+0xd/0x10 [a02441f5] mlx4_init_eq_table+0x1c5/0x4a0 [mlx4_core] [a0248b08] mlx4_setup_hca+0x98/0x550 [mlx4_core] [a0249891] ? __mlx4_init_one+0x8d1/0x920 [mlx4_core] [a0249331] __mlx4_init_one+0x371/0x920 [mlx4_core] [a024df18] mlx4_init_one+0x22/0x44 [mlx4_core] [8025cd90] ? do_work_for_cpu+0x0/0x30 [803a43e2] local_pci_probe+0x12/0x20 [8025cda3] do_work_for_cpu+0x13/0x30 [802613e6] kthread+0x56/0x90 [8020cffa] child_rip+0xa/0x20 [8020c9c0] ? restore_args+0x0/0x30 [80261390] ? kthread+0x0/0x90 [8020cff0] ? child_rip+0x0/0x20 ---[ end trace 4480af29bc755c6a ]--- Bart. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 0/10] RDMAoE support
On Mon, Aug 10, 2009 at 09:45:58AM -0400, Hal Rosenstock wrote: How is port configuration (RDMAoE v. IB) accomplished ? Is it prior to boot time or dynamic ? mlx4 allows changing port designation dynamically by writing to the sysfs. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 10/10] mlx4: Add RDMAoE support - allow interfaces to correspond to each other
On Wed, Aug 05, 2009 at 02:42:59PM -0600, Jason Gunthorpe wrote: What about multicast though? Switches are going to have trouble with group membership lists for non IP packets.. Even just sending a ICMPv6 packet (with an IPv6 ethertype) isn't guaranteed to fix it. In this patch set, all multicast packets use the broadcast mac. We will address this issue at a future time. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 01/10] ib_core: Refine device personality from node type to port type
On Wed, Aug 05, 2009 at 01:43:12PM -0700, Sean Hefty wrote: Can resources (PDs, CQs, MRs, etc.) between the different transports be shared? Does QP failover between transports work? There is nothing in the architecture that precludes this; we are not currently focusing on this. Did you consider modifying rdma_node_get_transport() and returning a bitmask of the supported transports available on the device? I'm wondering if something like this makes sense, to allow skipping devices that are not of interest to a particular module. This would be in addition to the rdma_port_get_transport call. There's just a lot of new checks to handle the transport on a port by port basis. We can use a function: rdma_is_transport_supported(ibdev, transport), which will return true if at least one port runs the given transport. Thus, as long as we have only a few transports, these checks will amount to 1-2 lines of code in each module. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 01/10] ib_core: Refine device personality from node type to port type
On Thu, Aug 06, 2009 at 10:34:19AM -0700, Sean Hefty wrote: Does the implementation allow this? Right now PDs, CQs, etc are allocated per device, not per port. I'm not immediately concerned about QP failover. However, I believe there needs to be some level of coordination between the Infiniband side of the CM and the Ethernet side of the CM, since QPs are associated with CA GUIDs. I'm just trying to understand the impact of this coordination. There is nothing in the implementation to prevent it. We did not see a reason to. The ports share a common node GUID but each one has its own GIDs. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 04/10] IB/umad: Enable support for RDMAoE ports
On Thu, Aug 06, 2009 at 11:05:47AM -0700, Sean Hefty wrote: Is there a need to expose QP1 to user space? The CM is in the kernel, and there's not an SA. Good point. There seems to be no reason to expose it. Will fix. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 03/10] ib_core: RDMAoE support only QP1
On Thu, Aug 06, 2009 at 10:52:34AM -0700, Sean Hefty wrote: +/* Validate device and port */ +port_priv = ib_get_mad_port(device, port_num); +if (!port_priv) { +ret = ERR_PTR(-ENODEV); +goto error1; +} + +if (!port_priv-qp_info[qp_type].qp) +return NULL; It seems odd that the first if has 'goto error1', but the second if simply returns NULL. The original intention was to release the caller from the need to decide whether to register the mad agent or not and so the NULL returned would not be treated as error. Thinking it over I realize that it would be better to let the caller decide (according to the port protocol) whether or not to register the mad agent. Will fix. @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; +if (!mad_agent) +return 0; Why would a kernel client call ib_unregister_mad_agent with a NULL pointer? Same as above. Goes away after the fix. cq_size = (IB_MAD_QP_SEND_SIZE + IB_MAD_QP_RECV_SIZE) * 2; +has_smi = rdma_port_get_transport(device, port_num) == RDMA_TRANSPORT_IB; +if (has_smi) +cq_size *= 2; cq_size is doubled twice This is a bug - I'll fix it - thanks. I really wish there were a cleaner way to add this support that didn't involve adding so many checks throughout the code. It's hard to know if checks were added in all the places that were needed. I can't think of a clever way to handle QP 0. The fix discussed above will eliminate a good portion of these checks. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv4 0/10] RDMAoE support
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. RDMAoE encodes IP addresses into its GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs, standard IP to MAC mappings apply. To support RDMAoE, a new transport protocol was added to the IB core. An RDMA device can have ports with different transports, which are identified by a port transport attribute. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs while LID fields are ignored. The Ethernet L2 information is subsequently obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. As there is no SA in RDMAoE, the CMA code is modified to fill the necessary path record attributes locally before sending CM packets. Similarly, the CMA provides to the user the required address handle attributes when processing SIDR requests and joining multicast groups. In this patch set, an RDMAoE port is currently assigned a single GID, encoding the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code temporarily uses IPv6 link-local addresses as GIDs instead of the IP address provided by the user, thereby supporting any IP address. In addition, multicast packets currently use the broadcast MAC. To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. 
Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. Following is a series of 10 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches, and is tagged v4. Changes from v3: 1. RDMA transport is determined on a per-port basis instead of the link-type notion. 2. SA services are not provided for RDMAoE clients. CMA code is modified to support RDMAoE transport types. 3. For brevity, GID to MAC resolution is currently restricted to link local addresses. Signed-off-by: Eli Cohen e...@mellanox.co.il --- b/drivers/infiniband/core/agent.c | 12 - b/drivers/infiniband/core/cm.c | 26 +- b/drivers/infiniband/core/cma.c | 54 ++-- b/drivers/infiniband/core/mad.c | 42 ++- b/drivers/infiniband/core/multicast.c |5 b/drivers/infiniband/core/sa_query.c| 40 ++- b/drivers/infiniband/core/ucm.c |8 b/drivers/infiniband/core/ucma.c|2 b/drivers/infiniband/core/ud_header.c | 111 ++ b/drivers/infiniband/core/user_mad.c|7 b/drivers/infiniband/core/uverbs.h |1 b/drivers/infiniband/core/uverbs_cmd.c | 32 ++ b/drivers/infiniband/core/uverbs_main.c |1 b/drivers/infiniband/core/verbs.c | 12 - b/drivers/infiniband/hw/mlx4/ah.c | 187 +--- b/drivers/infiniband/hw/mlx4/main.c | 309 +--- b/drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 + b/drivers/infiniband/hw/mlx4/qp.c | 172 ++- b/drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 - b/drivers/net/mlx4/en_main.c| 15 + b/drivers/net/mlx4/en_port.c|4 b/drivers/net/mlx4/en_port.h|3 b/drivers/net/mlx4/fw.c |3 b/drivers/net/mlx4/intf.c | 20 + b/drivers/net/mlx4/main.c |6 b/drivers/net/mlx4/mlx4.h |1 b/include/linux/mlx4/cmd.h |1 b/include/linux/mlx4/device.h | 31 ++ b/include/linux/mlx4/driver.h | 16 + b/include/linux/mlx4/qp.h |8 b/include/rdma/ib_addr.h| 87 +++ b/include/rdma/ib_pack.h| 26 ++ 
b/include/rdma/ib_user_verbs.h | 21 + b/include/rdma/ib_verbs.h |9 b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |3 b/net/sunrpc/xprtrdma/svc_rdma_transport.c |2 drivers/infiniband/core/cm.c|2 drivers/infiniband/core/cma.c | 150 + drivers/infiniband/core/mad.c | 55 +++- drivers/infiniband/core/ucm.c | 12 - drivers/infiniband/core/ucma.c | 25 +- drivers/infiniband/core/user_mad.c | 27
[ofa-general] [PATCHv4 01/10] ib_core: Refine device personality from node type to port type
As a preparation to devices that, in general, support different transport protocol for each port, specifically RDMAoE, this patch defines transport type for each of a device's ports. As a result rdma_node_get_transport() has been unexported and is used internally by the implementation of the new API, rdma_port_get_transport() which gives the transport protocol of the queried port. All references to rdma_node_get_transport() are changed to to use rdma_port_get_transport(). Also, ib_port_attr is extended to contain enum rdma_transport_type. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cm.c | 26 - drivers/infiniband/core/cma.c | 54 +++-- drivers/infiniband/core/mad.c | 42 +- drivers/infiniband/core/multicast.c |5 +-- drivers/infiniband/core/sa_query.c| 40 - drivers/infiniband/core/ucm.c |8 +++- drivers/infiniband/core/ucma.c|2 +- drivers/infiniband/core/user_mad.c|7 ++-- drivers/infiniband/core/verbs.c | 12 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 +++--- include/rdma/ib_verbs.h |9 +++-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |3 +- net/sunrpc/xprtrdma/svc_rdma_transport.c |2 +- 13 files changed, 128 insertions(+), 94 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 5130fc5..f930f1d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3678,9 +3678,7 @@ static void cm_add_one(struct ib_device *ib_device) unsigned long flags; int ret; u8 i; - - if (rdma_node_get_transport(ib_device-node_type) != RDMA_TRANSPORT_IB) - return; + enum rdma_transport_type tt; cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * ib_device-phys_port_cnt, GFP_KERNEL); @@ -3700,6 +3698,10 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { + tt = rdma_port_get_transport(ib_device, i); + if (tt != RDMA_TRANSPORT_IB) + continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto 
error1; @@ -3742,9 +3744,11 @@ error1: port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); @@ -3770,10 +3774,12 @@ static void cm_remove_one(struct ib_device *ib_device) for (i = 1; i = ib_device-phys_port_cnt; i++) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - flush_workqueue(cm.wq); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + flush_workqueue(cm.wq); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index aa62101..866ff7f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -337,24 +337,26 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev; union ib_gid gid; int ret = -ENODEV; - - switch (rdma_node_get_transport(dev_addr-dev_type)) { - case RDMA_TRANSPORT_IB: - ib_addr_get_sgid(dev_addr, gid); - break; - case RDMA_TRANSPORT_IWARP: - iw_addr_get_sgid(dev_addr, gid); - break; - default: - return -ENODEV; - } + int port; list_for_each_entry(cma_dev, dev_list, list) { - ret = ib_find_cached_gid(cma_dev-device, gid, -id_priv-id.port_num, NULL); - if (!ret) { - cma_attach_to_dev(id_priv, cma_dev); - break; + for (port = 1; port = cma_dev-device-phys_port_cnt; ++port) { + switch (rdma_port_get_transport
[ofa-general] [PATCHv4 02/10] ib_core: Add RDMAoE transport protocol
Add a new transport protocol, RDMAoE, used for transporting Infiniband traffic over Ethernet fabrics. Signed-off-by: Eli Cohen e...@mellanox.co.il --- include/rdma/ib_verbs.h |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b557129..4eec70f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,7 +69,8 @@ enum rdma_node_type { enum rdma_transport_type { RDMA_TRANSPORT_IB, - RDMA_TRANSPORT_IWARP + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_RDMAOE }; enum ib_device_cap_flags { -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv4 03/10] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/agent.c | 12 +--- drivers/infiniband/core/mad.c | 55 +-- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..c3f2048 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device*device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -175,6 +176,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7b737c4..de83c71 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -199,6 +199,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, unsigned long flags; u8 mgmt_class, vclass; + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + + if (!port_priv-qp_info[qp_type].qp) + 
return NULL; + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) @@ -260,13 +270,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Validate device and port */ - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; + if (!mad_agent) + return 0; + /* If the TID is zero, the agent can only snoop. */ if (mad_agent-hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -2602,6 +2608,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2652,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i IB_MAD_QPS_CORE; i++) { qp = port_priv-qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2684,6 +2696,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i IB_MAD_QPS_CORE; i++) { + if (!port_priv-qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(port_priv-qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX Couldn't post receive WRs\n); @@ -2762,6 +2777,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info-qp) + return; + ib_destroy_qp(qp_info-qp); kfree(qp_info-snoop_table); } @@ -2777,6 +2795,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof ib_mad123]; + int has_smi; /* Create new 
device info */ port_priv = kzalloc(sizeof
[ofa-general] [PATCHv4 05/10] ib/cm: Enable CM support for RDMAoE
CM messages can be transported on RDMAoE protocol ports so they are enabled here. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cm.c |2 +- drivers/infiniband/core/ucm.c | 12 +--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f930f1d..63d6de3 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3699,7 +3699,7 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { tt = rdma_port_get_transport(ib_device, i); - if (tt != RDMA_TRANSPORT_IB) + if (tt != RDMA_TRANSPORT_IB tt != RDMA_TRANSPORT_RDMAOE) continue; port = kzalloc(sizeof *port, GFP_KERNEL); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 4f5096d..21c78f5 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1240,13 +1240,19 @@ static void ib_ucm_add_one(struct ib_device *device) { struct ib_ucm_device *ucm_dev; int i; + enum rdma_transport_type tt; if (!device-alloc_ucontext || device-node_type == RDMA_NODE_IB_SWITCH) return; - for (i = 1; i = device-phys_port_cnt; ++i) - if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB) - return; + for (i = 1; i = device-phys_port_cnt; ++i) { + tt = rdma_port_get_transport(device, i); + if (tt == RDMA_TRANSPORT_IB || tt == RDMA_TRANSPORT_RDMAOE) + break; + } + + if (i device-phys_port_cnt) + return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); if (!ucm_dev) -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv4 06/10] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Path resolving and multicast joining are implemented within cma.c by filling the responses and pushing the callbacks to the cma work queue. IP-GID resolution always yield IPv6 link local addresses - remote GIDs are derived from the destination MAC address of the remote port. Multicast GIDs are always mapped to broadcast MAC (all FFs). Some helper functions are added to ib_addr.h. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cma.c | 150 ++- drivers/infiniband/core/ucma.c | 25 +-- include/rdma/ib_addr.h | 87 +++ 3 files changed, 251 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 866ff7f..8f5675b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -58,6 +58,7 @@ MODULE_LICENSE(Dual BSD/GPL); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define RDMAOE_PACKET_LIFETIME 18 static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -174,6 +175,12 @@ struct cma_ndev_work { struct rdma_cm_eventevent; }; +struct rdmaoe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast*mc; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -348,6 +355,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); break; + case RDMA_TRANSPORT_RDMAOE: + rdmaoe_addr_get_sgid(dev_addr, gid); + break; default: return -ENODEV; } @@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; int ret; + u16 pkey; + +if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) == + RDMA_TRANSPORT_IB) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0x; ret = 
ib_find_cached_pkey(id_priv-id.device, id_priv-id.port_num, - ib_addr_get_pkey(dev_addr), - qp_attr-pkey_index); + pkey, qp_attr-pkey_index); if (ret) return ret; @@ -609,6 +625,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: if (!id_priv-cm_id.ib || cma_is_ud_ps(id_priv-id.ps)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else @@ -836,7 +853,9 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc = container_of(id_priv-mc_list.next, struct cma_multicast, list); list_del(mc-list); - ib_sa_free_multicast(mc-multicast.ib); + if (rdma_port_get_transport(id_priv-cma_dev-device, id_priv-id.port_num) == + RDMA_TRANSPORT_IB) + ib_sa_free_multicast(mc-multicast.ib); kref_put(mc-mcref, release_mc); } } @@ -855,6 +874,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_unlock(lock); switch (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: if (id_priv-cm_id.ib !IS_ERR(id_priv-cm_id.ib)) ib_destroy_cm_id(id_priv-cm_id.ib); break; @@ -1512,6 +1532,7 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) if (id-device) { switch (rdma_port_get_transport(id-device, id-port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: ret = cma_ib_listen(id_priv); if (ret) goto err; @@ -1727,6 +1748,65 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = id_priv-id.route; + struct rdma_addr *addr = route-addr; + struct cma_work *work; + int ret; + struct sockaddr_in
[ofa-general] [PATCHv4 07/10] ib_core: RDMAoE UD packet packing support
Add support functions to aid in packing RDMAoE packets. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/ud_header.c | 111 +++ include/rdma/ib_pack.h | 26 2 files changed, 137 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 8ec7876..d04b6f2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits= 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits= 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -241,6 +264,53 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_rdmaoe_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @header:Structure to initialize + * + * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, grh.payload_length, + * grh.next_header, bth.opcode, bth.pad_count and + * bth.transport_header_version fields of a struct eth_ud_header given + * the payload length and whether a GRH will be included. 
+ */ +void ib_rdmaoe_ud_header_init(int payload_bytes, + int grh_present, + struct eth_ud_header*header) +{ + int header_len; + + memset(header, 0, sizeof *header); + + header_len = + sizeof header-eth + + IB_BTH_BYTES + + IB_DETH_BYTES; + if (grh_present) + header_len += IB_GRH_BYTES; + + header-grh_present = grh_present; + if (grh_present) { + header-grh.ip_version = 6; + header-grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + +IB_DETH_BYTES+ +payload_bytes+ +4+ /* ICRC */ +3) ~3); /* round up */ + header-grh.next_header = 0x1b; + } + + if (header-immediate_present) + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header-bth.pad_count= (4 - payload_bytes) 3; + header-bth.transport_header_version = 0; +} +EXPORT_SYMBOL(ib_rdmaoe_ud_header_init); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header, EXPORT_SYMBOL(ib_ud_header_pack); /** + * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int rdmaoe_ud_header_pack(struct eth_ud_header *header, + void *buf) +{ + int len = 0; + + ib_pack(eth_table, ARRAY_SIZE(eth_table), + header-eth, buf); + len += IB_ETH_BYTES; + + if (header-grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + header-grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + header-bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + header-deth, buf + len); + len += IB_DETH_BYTES; + + if (header-immediate_present) { + memcpy(buf + len, header-immediate_data, + sizeof header-immediate_data); + len += sizeof header-immediate_data; + } + + return len; +} +EXPORT_SYMBOL(rdmaoe_ud_header_pack); + +/** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/include/rdma/ib_pack.h b/include
[ofa-general] [PATCHv4 08/10] ib_core: Add API to support RDMAoE from userspace
Add ib_uverbs_get_mac() to be used by ibv_create_ah() to retirieve the remore port's MAC address. Port transport is also returned by ibv_query_port(). ABI version is incremented from 6 to 7. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 32 drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 10 ++ include/rdma/ib_user_verbs.h | 21 ++--- include/rdma/ib_verbs.h | 12 6 files changed, 74 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index b3ea958..e69b04c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(get_mac); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..012aadf 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width= attr.active_width; resp.active_speed= attr.active_speed; resp.phys_state = attr.phys_state; + resp.transport = attr.transport; if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, sizeof resp)) @@ -1824,6 +1825,37 @@ err: return ret; } +ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_get_maccmd; + struct ib_uverbs_get_mac_resp resp; + int ret; + struct ib_pd*pd; + + if (out_len sizeof resp) + return -ENOSPC; + + if (copy_from_user(cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file-ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac); + put_pd_read(pd); + if (!ret) { + if 
(copy_to_user((void __user *) (unsigned long) cmd.response, +resp, sizeof resp)) + return -EFAULT; + + return in_len; + } + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..2641845 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac, }; static struct vfsmount *uverbs_event_mnt; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3b2f00b..7cce5d6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -912,3 +912,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp-device-detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + +int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac) +{ + if (!device-get_mac) + return -ENOSYS; + + return device-get_mac(device, port, gid, mac); +} +EXPORT_SYMBOL(ib_get_mac); + diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..49eee8a 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -42,7 +42,7 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. 
*/ -#define IB_USER_VERBS_ABI_VERSION 6 +#define IB_USER_VERBS_ABI_VERSION 7 enum { IB_USER_VERBS_CMD_GET_CONTEXT, @@ -81,7 +81,8 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_GET_MAC }; /* @@ -205,7 +206,8 @@ struct ib_uverbs_query_port_resp { __u8 active_width; __u8 active_speed; __u8 phys_state; - __u8 reserved[3]; + __u8 transport; + __u8 reserved[2]; }; struct ib_uverbs_alloc_pd { @@ -621,6 +623,19 @@ struct ib_uverbs_destroy_ah { __u32 ah_handle; }; +struct
[ofa-general] [PATCHv4 09/10] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or a multicast GID and otherwise -EINVAL is returned. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/ah.c | 187 -- drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 +++- drivers/infiniband/hw/mlx4/qp.c | 172 +-- drivers/net/mlx4/fw.c|3 +- include/linux/mlx4/device.h | 31 ++- include/linux/mlx4/qp.h |8 +- 6 files changed, 327 insertions(+), 93 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..0a015c3 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,166 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 s6 = {0}; + struct net_device *netdev; + int ifidx; - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); - memset(ah-av, 0, sizeof ah-av); + memcpy(s6.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + s6.sin6_family = AF_INET6; + s6.sin6_scope_id = ifidx; + if (rdma_link_local_addr(s6.sin6_addr)) + rdma_get_ll_mac(s6.sin6_addr, mac); + else if (rdma_is_multicast_addr(s6.sin6_addr)) { + 
rdma_get_mcast_mac(s6.sin6_addr, mac); + *is_mcast = 1; + } else + return -EINVAL; - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - --ah-av.stat_rate; - } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + return 0; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); + } + + ah-av.ib.dlid= cpu_to_be16(ah_attr-dlid); + if (ah_attr-static_rate) { + ah-av.ib.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; + while (ah-av.ib.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET + !(1 ah-av.ib.stat_rate dev-caps.stat_rate_support)) + --ah-av.ib.stat_rate; } + ah-av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); return ah-ibah; } +static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd-device); + struct mlx4_dev *dev = ibdev-dev; + u8 mac[6]; + int err; + int is_mcast; + + err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, is_mcast); + 
if (err
[ofa-general] [PATCHv4 10/10] mlx4: Add RDMAoE support - allow interfaces to correspond to each other
This patch add support RDMAoE for mlx4. Since mlx4_ib now needs to reference mlx4_en netdevices, a new mechanism was added. Two new fields were added to struct mlx4_interface to define a protocol and a get_prot_dev method to retrieve the corresponding protocol's net device. An implementation of the new verb ib_get_port_link_type() - mlx4_ib_get_port_link_type - was added. mlx4_ib_query_port() has been modified to support eth link types. An interface is considered to be active if its corresponding eth interface is active. Code for setting the GID table of a port has been added. Currently, each IB port has a single GID entry in its table and that GID entery equals the link local IPv6 address. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c | 309 + drivers/net/mlx4/en_main.c| 15 ++- drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|3 +- drivers/net/mlx4/intf.c | 20 +++ drivers/net/mlx4/main.c |6 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/cmd.h |1 + include/linux/mlx4/driver.h | 16 ++- 9 files changed, 335 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d759..737c6b9 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -34,9 +34,12 @@ #include linux/module.h #include linux/init.h #include linux/errno.h +#include linux/netdevice.h +#include linux/inetdevice.h #include rdma/ib_smi.h #include rdma/ib_user_verbs.h +#include rdma/ib_addr.h #include linux/mlx4/driver.h #include linux/mlx4/cmd.h @@ -57,6 +60,15 @@ static const char mlx4_ib_version[] = DRV_NAME : Mellanox ConnectX InfiniBand driver v DRV_VERSION ( DRV_RELDATE )\n; +struct update_gid_work { + struct work_struct work; + union ib_gid gids[128]; + int port; + struct mlx4_ib_dev *dev; +}; + +static struct workqueue_struct *wq; + static void init_query_mad(struct ib_smp *mad) { mad-base_version = 1; @@ -152,28 +164,19 @@ out: return err; } -static int 
mlx4_ib_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +static enum rdma_transport_type +mlx4_ib_port_get_transport(struct ib_device *device, u8 port_num) { - struct ib_smp *in_mad = NULL; - struct ib_smp *out_mad = NULL; - int err = -ENOMEM; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) - goto out; - - memset(props, 0, sizeof *props); - - init_query_mad(in_mad); - in_mad-attr_id = IB_SMP_ATTR_PORT_INFO; - in_mad-attr_mod = cpu_to_be32(port); + struct mlx4_dev *dev = to_mdev(device)-dev; - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); - if (err) - goto out; + return dev-caps.port_mask (1 (port_num - 1)) ? + RDMA_TRANSPORT_IB : RDMA_TRANSPORT_RDMAOE; +} +static void ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ props-lid = be16_to_cpup((__be16 *) (out_mad-data + 16)); props-lmc = out_mad-data[34] 0x7; props-sm_lid = be16_to_cpup((__be16 *) (out_mad-data + 18)); @@ -193,6 +196,67 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, props-subnet_timeout = out_mad-data[51] 0x1f; props-max_vl_num = out_mad-data[37] 4; props-init_type_reply = out_mad-data[41] 4; + props-transport= RDMA_TRANSPORT_IB; +} + +static void eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ + struct mlx4_ib_rdmaoe *rdmaoe = to_mdev(ibdev)-rdmaoe; + struct net_device *ndev; + + props-port_cap_flags = IB_PORT_CM_SUP; + props-gid_tbl_len = to_mdev(ibdev)-dev-caps.gid_table_len[port]; + props-max_msg_sz = to_mdev(ibdev)-dev-caps.max_msg_sz; + props-pkey_tbl_len = 1; + props-bad_pkey_cntr= be16_to_cpup((__be16 *) (out_mad-data + 46)); + props-qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad-data + 48)); + props-active_width = 0; + props-active_speed = 0; + props-max_mtu = out_mad-data[41] 0xf; + props-subnet_timeout = 0; 
+ props-max_vl_num = out_mad-data[37] 4; + props-init_type_reply = 0; + props-transport= RDMA_TRANSPORT_RDMAOE; + spin_lock(rdmaoe-lock); + ndev = rdmaoe-netdevs[port - 1]; + if (!ndev) + goto out
[ofa-general] [PATCHv4] libibverbs: Add RDMAoE support
Extend the ibv_query_port() verb to return a port transport protocol which can be one of RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP or RDMA_TRANSPORT_RDMAOE. This can be used by applications to know if they must use GRH as is the case in RDMAoE. Add a new system call to get the MAC address of the remote port that a UD address vector refers to. Update ibv_rc_pingpong and ibv_ud_pingpong to accept a remote GID so that they can be used with an RDMAoE port. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changed the reference to a port from link type to protocol type. This patch is tagged v4 to create correspondence with the kernel patches. examples/devinfo.c| 15 examples/pingpong.c |9 +++ examples/pingpong.h |2 + examples/rc_pingpong.c| 50 examples/ud_pingpong.c| 38 +++ include/infiniband/driver.h |1 + include/infiniband/kern-abi.h | 25 ++-- include/infiniband/verbs.h| 12 + src/cmd.c | 20 src/libibverbs.map|1 + 10 files changed, 155 insertions(+), 18 deletions(-) diff --git a/examples/devinfo.c b/examples/devinfo.c index caa5d5f..a42a6dc 100644 --- a/examples/devinfo.c +++ b/examples/devinfo.c @@ -175,6 +175,20 @@ static int print_all_port_gids(struct ibv_context *ctx, uint8_t port_num, int tb return rc; } +static const char *transport_type_str(enum rdma_transport_type type) +{ + switch (type) { + case RDMA_TRANSPORT_IB: + return IB; + case RDMA_TRANSPORT_IWARP: + return IWARP; + case RDMA_TRANSPORT_RDMAOE: + return RDMAOE; + default: + return Unknown; + } +} + static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) { struct ibv_context *ctx; @@ -273,6 +287,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid); printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid); printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc); + printf(\t\t\ttrasnport_type:\t\t%s\n, transport_type_str(port_attr.transport)); if (verbose) { printf(\t\t\tmax_msg_sz:\t\t0x%x\n, port_attr.max_msg_sz); diff --git 
a/examples/pingpong.c b/examples/pingpong.c index b916f59..d4a46e4 100644 --- a/examples/pingpong.c +++ b/examples/pingpong.c @@ -31,6 +31,8 @@ */ #include pingpong.h +#include arpa/inet.h +#include stdlib.h enum ibv_mtu pp_mtu_to_enum(int mtu) { @@ -53,3 +55,10 @@ uint16_t pp_get_local_lid(struct ibv_context *context, int port) return attr.lid; } + +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr) +{ + return ibv_query_port(context, port, attr); +} + diff --git a/examples/pingpong.h b/examples/pingpong.h index 71d7c3f..16d3466 100644 --- a/examples/pingpong.h +++ b/examples/pingpong.h @@ -37,5 +37,7 @@ enum ibv_mtu pp_mtu_to_enum(int mtu); uint16_t pp_get_local_lid(struct ibv_context *context, int port); +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr); #endif /* IBV_PINGPONG_H */ diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c index 26fa45c..4250cdf 100644 --- a/examples/rc_pingpong.c +++ b/examples/rc_pingpong.c @@ -67,6 +67,8 @@ struct pingpong_context { int size; int rx_depth; int pending; + struct ibv_port_attr portinfo; + union ibv_giddgid; }; struct pingpong_dest { @@ -94,6 +96,12 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, .port_num = port } }; + + if (ctx-dgid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx-dgid; + } if (ibv_modify_qp(ctx-qp, attr, IBV_QP_STATE | IBV_QP_AV | @@ -289,11 +297,11 @@ out: static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, - int use_event) + int use_event, int is_server) { struct pingpong_context *ctx; - ctx = malloc(sizeof *ctx); + ctx = calloc(1, sizeof *ctx); if (!ctx) return NULL; @@ -306,7 +314,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, return NULL; } - memset
[ofa-general] [PATCHv4] libmlx4: Add RDMAoE support
Modify mlx4_create_ah() to check the port's transport protocol, and for the case of RDMAoE ports, do a system call to retrieve the remote port's MAC address. Make modifications to address vector data structs and code to accomodate for RDMAoE. --- Changed the reference to a port from link type to protocol type. This patch is tagged v4 to create correspondence with the kernel patches. src/mlx4.h |3 +++ src/qp.c|2 ++ src/verbs.c | 29 + src/wqe.h |3 ++- 4 files changed, 36 insertions(+), 1 deletions(-) diff --git a/src/mlx4.h b/src/mlx4.h index 827a201..20d3fdd 100644 --- a/src/mlx4.h +++ b/src/mlx4.h @@ -236,11 +236,14 @@ struct mlx4_av { uint8_t hop_limit; uint32_tsl_tclass_flowlabel; uint8_t dgid[16]; + uint8_t mac[8]; }; struct mlx4_ah { struct ibv_ah ibv_ah; struct mlx4_av av; + uint16_tvlan; + uint8_t mac[6]; }; static inline unsigned long align(unsigned long val, unsigned long align) diff --git a/src/qp.c b/src/qp.c index d194ae3..cd8fab0 100644 --- a/src/qp.c +++ b/src/qp.c @@ -143,6 +143,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, memcpy(dseg-av, to_mah(wr-wr.ud.ah)-av, sizeof (struct mlx4_av)); dseg-dqpn = htonl(wr-wr.ud.remote_qpn); dseg-qkey = htonl(wr-wr.ud.remote_qkey); + dseg-vlan = htons(to_mah(wr-wr.ud.ah)-vlan); + memcpy(dseg-mac, to_mah(wr-wr.ud.ah)-mac, 6); } static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) diff --git a/src/verbs.c b/src/verbs.c index cc179a0..e60ab05 100644 --- a/src/verbs.c +++ b/src/verbs.c @@ -614,9 +614,21 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp) return 0; } +static int mcast_mac(uint8_t *mac) +{ + int i; + uint8_t val = 0xff; + + for (i = 0; i 6; ++i) + val = mac[i]; + + return val == 0xff; +} + struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { struct mlx4_ah *ah; + struct ibv_port_attr port_attr; ah = malloc(sizeof *ah); if (!ah) @@ -642,7 +654,24 @@ struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) 
memcpy(ah-av.dgid, attr-grh.dgid.raw, 16); } + if (ibv_query_port(pd-context, attr-port_num, port_attr)) + goto err; + + if (port_attr.transport == RDMA_TRANSPORT_RDMAOE) { + if (ibv_cmd_get_mac(pd, attr-port_num, ah-av.dgid, ah-mac)) + goto err; + + ah-vlan = 0; + if (mcast_mac(ah-mac)) + ah-av.dlid = htons(0xc000); + + } + + return ah-ibv_ah; +err: + free(ah); + return NULL; } int mlx4_destroy_ah(struct ibv_ah *ah) diff --git a/src/wqe.h b/src/wqe.h index 6f7f309..ea6f27f 100644 --- a/src/wqe.h +++ b/src/wqe.h @@ -78,7 +78,8 @@ struct mlx4_wqe_datagram_seg { uint32_tav[8]; uint32_tdqpn; uint32_tqkey; - uint32_treserved[2]; + __be16 vlan; + uint8_t mac[6]; }; struct mlx4_wqe_data_seg { -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH] cma: fix access to freed memory
On Wed, Aug 05, 2009 at 08:46:43AM -0700, Sean Hefty wrote: rdma_destroy_id and rdma_leave_multicast call ib_sa_free_multicast. This call will block until the join callback completes or is canceled. Can you describe the race with cma_ib_mc_handler in more detail? That explains it. I was using a different join implementation for RDMAoE without a leave operation, so I had to use this kref solution. So there is no need for this patch. I will provide a distinct solution for the RDMAoE case. Thanks. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH v2] cma: fix access to freed memory
rdma_join_multicast() allocates struct cma_multicast and then proceeds to join to a multicast address. However, the join operation completes in another context and the allocated struct could be released if the user destroys either the rdma_id object or decides to leave the multicast group while the join operation is in progress. This patch uses a kref object to maintain reference counting to avoid such situation. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: I removed the protection of mc list manipulation using spinlocks becuase - a. In order to break into different patches b. I have doubts as for the necessity of this protection. drivers/infiniband/core/cma.c | 20 1 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..aa62101 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -157,6 +157,7 @@ struct cma_multicast { struct list_headlist; void*context; struct sockaddr_storage addr; + struct kref mcref; }; struct cma_work { @@ -290,6 +291,13 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(cma_dev-comp); } +void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(id_priv-list); @@ -827,7 +835,7 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) struct cma_multicast, list); list_del(mc-list); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + kref_put(mc-mcref, release_mc); } } @@ -2643,7 +2651,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) id_priv = mc-id_priv; if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) - return 0; + goto out; mutex_lock(id_priv-qp_mutex); if (!status id_priv-id.qp) @@ -2669,10 +2677,12 @@ static int 
cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(id_priv-handler_mutex); rdma_destroy_id(id_priv-id); - return 0; + goto out; } mutex_unlock(id_priv-handler_mutex); +out: + kref_put(mc-mcref, release_mc); return 0; } @@ -2759,11 +2769,13 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(mc-addr, addr, ip_addr_size(addr)); mc-context = context; mc-id_priv = id_priv; + kref_init(mc-mcref); spin_lock(id_priv-lock); list_add(mc-list, id_priv-mc_list); spin_unlock(id_priv-lock); + kref_get(mc-mcref); switch (rdma_node_get_transport(id-device-node_type)) { case RDMA_TRANSPORT_IB: ret = cma_join_ib_multicast(id_priv, mc); @@ -2800,7 +2812,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) mc-multicast.ib-rec.mgid, mc-multicast.ib-rec.mlid); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + kref_put(mc-mcref, release_mc); return; } } -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [PATCH] cma: fix access to freed memory
On Tue, Aug 04, 2009 at 09:05:13AM -0700, Roland Dreier wrote: Maybe it's just a loose connection but yet, it seems to me that operations on id_priv->mc_list should be protected. Should I send a different patch? "seems ... should be" is very weak justification for locking. What should they be protected from? What if rdma_join_multicast() is called while rdma_destroy_id() runs - for example from cma_ib_handler() due to an error returned from the handler? In this case list_add(&mc->list, &id_priv->mc_list) in rdma_join_multicast() may be executed along with the list manipulation done in cma_leave_mc_groups(). Generally, it looks strange that in some places list handling is protected with a spinlock and in other places not. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] cma: fix access to freed memory
rdma_join_multicast() allocates struct cma_multicast and then proceeds to join to a multicast address. However, the join operation completes in another context and the allocated struct could be released if the user destroys either the rdma_id object or decides to leave the multicast group while the join is in progress. This patch uses reference counting to to avoid such situation. It also protects removal from id_priv-mc_list in cma_leave_mc_groups(). Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cma.c | 23 +++ 1 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..8fee477 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -157,6 +157,7 @@ struct cma_multicast { struct list_headlist; void*context; struct sockaddr_storage addr; + atomic_trefcount; }; struct cma_work { @@ -290,6 +291,12 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(cma_dev-comp); } +void cma_deref_mc(struct cma_multicast *mc) +{ + if (atomic_dec_and_test(mc-refcount)) + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(id_priv-list); @@ -822,13 +829,17 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; + spin_lock_irq(id_priv-lock); while (!list_empty(id_priv-mc_list)) { mc = container_of(id_priv-mc_list.next, struct cma_multicast, list); list_del(mc-list); + spin_unlock_irq(id_priv-lock); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + cma_deref_mc(mc); + spin_lock_irq(id_priv-lock); } + spin_unlock_irq(id_priv-lock); } void rdma_destroy_id(struct rdma_cm_id *id) @@ -2643,7 +2654,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) id_priv = mc-id_priv; if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) - return 0; + goto out; mutex_lock(id_priv-qp_mutex); if 
(!status id_priv-id.qp) @@ -2669,10 +2680,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(id_priv-handler_mutex); rdma_destroy_id(id_priv-id); - return 0; + goto out; } mutex_unlock(id_priv-handler_mutex); +out: + cma_deref_mc(mc); return 0; } @@ -2759,11 +2772,13 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(mc-addr, addr, ip_addr_size(addr)); mc-context = context; mc-id_priv = id_priv; + atomic_set(mc-refcount, 1); spin_lock(id_priv-lock); list_add(mc-list, id_priv-mc_list); spin_unlock(id_priv-lock); + atomic_inc(mc-refcount); switch (rdma_node_get_transport(id-device-node_type)) { case RDMA_TRANSPORT_IB: ret = cma_join_ib_multicast(id_priv, mc); @@ -2800,7 +2815,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) mc-multicast.ib-rec.mgid, mc-multicast.ib-rec.mlid); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + cma_deref_mc(mc); return; } } -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] mlx4_core: map sufficient ICM memory for EQs
Current implementation allocates a single host page for EQ context memory. As the number of CPU cores increases, and since the number of required EQs depends on this number, this patch removes the hard coded limit and makes the allocation dependent on EQ entry size and the number of required EQs. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/net/mlx4/eq.c | 42 -- drivers/net/mlx4/main.c |1 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/device.h |1 + 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index b9ceddd..1d41b1a 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -530,29 +530,34 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) { struct mlx4_priv *priv = mlx4_priv(dev); int ret; + int host_pages, icm_pages; + int i; - /* -* We assume that mapping one page is enough for the whole EQ -* context table. This is fine with all current HCAs, because -* we only use 32 EQs and each EQ uses 64 bytes of context -* memory, or 1 KB total. 
-*/ + host_pages = ALIGN(min_t(int, dev-caps.num_eqs, num_possible_cpus() + 1) * + dev-caps.eqc_entry_size, PAGE_SIZE) PAGE_SHIFT; + priv-eq_table.order = ilog2(roundup_pow_of_two(host_pages)); priv-eq_table.icm_virt = icm_virt; - priv-eq_table.icm_page = alloc_page(GFP_HIGHUSER); + priv-eq_table.icm_page = alloc_pages(GFP_HIGHUSER, priv-eq_table.order); if (!priv-eq_table.icm_page) return -ENOMEM; priv-eq_table.icm_dma = pci_map_page(dev-pdev, priv-eq_table.icm_page, 0, - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + PAGE_SIZE priv-eq_table.order, + PCI_DMA_BIDIRECTIONAL); if (pci_dma_mapping_error(dev-pdev, priv-eq_table.icm_dma)) { - __free_page(priv-eq_table.icm_page); + __free_pages(priv-eq_table.icm_page, priv-eq_table.order); return -ENOMEM; } - ret = mlx4_MAP_ICM_page(dev, priv-eq_table.icm_dma, icm_virt); - if (ret) { - pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv-eq_table.icm_page); + icm_pages = (PAGE_SIZE / MLX4_ICM_PAGE_SIZE) * (1 priv-eq_table.order); + for (i = 0; i icm_pages; ++i) { + ret = mlx4_MAP_ICM_page(dev, priv-eq_table.icm_dma, icm_virt + i * MLX4_ICM_PAGE_SIZE); + if (ret) { + mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, i); + pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_pages(priv-eq_table.icm_page, priv-eq_table.order); + break; + } } return ret; @@ -561,11 +566,12 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) void mlx4_unmap_eq_icm(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); + int icm_pages = (PAGE_SIZE / MLX4_ICM_PAGE_SIZE) * (1 priv-eq_table.order); - mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, 1); - pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv-eq_table.icm_page); + mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, icm_pages); + pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, + PAGE_SIZE priv-eq_table.order, PCI_DMA_BIDIRECTIONAL); + 
__free_pages(priv-eq_table.icm_page, priv-eq_table.order); } int mlx4_alloc_eq_table(struct mlx4_dev *dev) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index dac621b..872becd 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -207,6 +207,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev-caps.max_cqes = dev_cap-max_cq_sz - 1; dev-caps.reserved_cqs = dev_cap-reserved_cqs; dev-caps.reserved_eqs = dev_cap-reserved_eqs; + dev-caps.eqc_entry_size = dev_cap-eqc_entry_sz; dev-caps.mtts_per_seg = 1 log_mtts_per_seg; dev-caps.reserved_mtts = DIV_ROUND_UP(dev_cap-reserved_mtts, dev-caps.mtts_per_seg); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 5bd79c2..1a20fa3 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -211,6 +211,7 @@ struct mlx4_eq_table { struct mlx4_icm_table cmpt_table; int have_irq; u8 inta_pin; + int
Re: [ofa-general] Re: [ewg] [PATCH 1/8 v3] ib_core: Add API to support RDMAoE
On Mon, Jul 13, 2009 at 03:26:06PM -0400, Hal Rosenstock wrote: +enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 port_num) +{ + return device->get_port_link_type ? + device->get_port_link_type(device, port_num) : PORT_LINK_IB; So do iWARP devices return PORT_LINK_IB ? If so, that seems a little weird to me. -- Hal Maybe it's more appropriate to make this function mandatory and require all drivers to report the correct port type. What do you think? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 2/8 v3] ib_core: RDMAoE support only QP1
On Mon, Jul 13, 2009 at 03:26:34PM -0400, Hal Rosenstock wrote: On Mon, Jul 13, 2009 at 2:14 PM, Eli Cohen e...@mellanox.co.il wrote: Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. What happens with other QP1 traffic (other than CM and SA) ? I think it should work but I haven't tried that. Userspace can access QP1 (and QP0). QP0 is not accessible since ib_register_mad_agent() will fail for QP0 because of this: if (!port_priv->qp_info[qp_type].qp) return NULL; QP1 should work in the same way Does QP0 error out ? What about QP1 ? Does it just timeout ? If so, a direct error would be better. See above - you can't access QP0. Do you know of a utility from userspace which sends/receives MADs on QP0 or QP1? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 2/8 v3] ib_core: RDMAoE support only QP1
On Tue, Jul 14, 2009 at 07:15:44AM -0400, Hal Rosenstock wrote: On Tue, Jul 14, 2009 at 3:46 AM, Eli Cohen e...@dev.mellanox.co.il wrote: On Mon, Jul 13, 2009 at 03:26:34PM -0400, Hal Rosenstock wrote: On Mon, Jul 13, 2009 at 2:14 PM, Eli Cohen e...@mellanox.co.il wrote: Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. What happens with other QP1 traffic (other than CM and SA) ? I think it should work but I haven't tried that. Would you ? You could try tools from infiniband-diags or ibdiagnet. Yes I would try that. But I need something that will not fail because it could not open QP0. For example, something that uses only QP1. Are there any in ibutils? Userspace can access QP1 (and QP0). QP0 is not accessible since ib_register_mad_agent() will fail for QP0 because of this: if (!port_priv->qp_info[qp_type].qp) return NULL; QP1 should work in the same way So what happens with things like PerfMgt class ? I think it ends up timing out if no receiver consumer is present. Does QP0 error out ? What about QP1 ? Does it just timeout ? If so, a direct error would be better. See above - you can't access QP0. Do you know of a utility from userspace which sends/receives MADs on QP0 or QP1? Yes, opensm, infiniband-diags (various), and ibutils (ibdiagnet, etc). -- Hal ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 0/8 v3] RDMAoE support
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames allowing the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. Aside from the considerations pointed out below, RDMAoE ports are functionally equivalent to regular IB ports from the RDMA stack perspective. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. In Ethernet, nodes are commonly referred to by applications by means of an IP address. RDMAoE encodes the IP addresses that were assigned to the corresponding Ethernet port into its GIDs, and makes use of the IP stack to bind a destination address to the corresponding netdevice (just as the CMA does today for IB and iWARP) and to obtain its L2 MAC addresses. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs and the L2 address fields in the API are ignored. The Ethernet L2 information is then obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. In order to maximize transparency for applications, RDMAoE implements a dedicated API that provides services equivalent to some of those provided by the IB-SA. The current approach is strictly local but may evolve in the future. This API is implemented using an independent source code file which allows for seamless evolution of the code without affecting the IB native SA interfaces. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. 
Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. Following is a series of 8 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches. The whole series is tagged v3. Signed-off-by: Eli Cohen e...@mellanox.co.il drivers/infiniband/core/Makefile |2 drivers/infiniband/core/addr.c| 20 drivers/infiniband/core/agent.c | 12 drivers/infiniband/core/cma.c | 124 +++ drivers/infiniband/core/mad.c | 48 + drivers/infiniband/core/multicast.c | 43 - drivers/infiniband/core/multicast.h | 79 ++ drivers/infiniband/core/rdmaoe_sa.c | 942 ++ drivers/infiniband/core/sa.h | 24 drivers/infiniband/core/sa_query.c| 26 drivers/infiniband/core/ud_header.c | 111 +++ drivers/infiniband/core/uverbs.h |1 drivers/infiniband/core/uverbs_cmd.c | 33 + drivers/infiniband/core/uverbs_main.c |1 drivers/infiniband/core/verbs.c | 17 drivers/infiniband/hw/mlx4/ah.c | 228 ++- drivers/infiniband/hw/mlx4/main.c | 276 +++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 drivers/infiniband/hw/mlx4/qp.c | 253 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c |3 drivers/net/mlx4/cmd.c|6 drivers/net/mlx4/en_main.c| 15 drivers/net/mlx4/en_port.c|4 drivers/net/mlx4/en_port.h|3 drivers/net/mlx4/intf.c | 20 drivers/net/mlx4/main.c |6 drivers/net/mlx4/mlx4.h |1 include/linux/mlx4/cmd.h |1 include/linux/mlx4/device.h | 31 include/linux/mlx4/driver.h | 16 include/linux/mlx4/qp.h |8 include/rdma/ib_addr.h| 53 + include/rdma/ib_pack.h| 26 include/rdma/ib_user_verbs.h | 21 include/rdma/ib_verbs.h | 22 include/rdma/rdmaoe_sa.h | 66 ++ 36 files changed, 2333 insertions(+), 239 deletions(-) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 1/8 v3] ib_core: Add API to support RDMAoE
Add two API functions needed for RDMAoE. ib_get_port_link_type() returns the link type support by the given device's port. It can be either PORT_LINK_IB for IB link layer or PORT_LINK_ETH for Ethernet links. Link type is reported to in query_port verb. ib_get_mac() will return the Ethernet MAC address leading to the port whose GID is spcified. This function is exported to userspace applications. ABI version is incremented from 6 to 7. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 33 + drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 17 + include/rdma/ib_user_verbs.h | 21 ++--- include/rdma/ib_verbs.h | 22 ++ 6 files changed, 92 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index b3ea958..e69b04c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(get_mac); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..eefc414 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width= attr.active_width; resp.active_speed= attr.active_speed; resp.phys_state = attr.phys_state; + resp.link_type = attr.link_type; if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, sizeof resp)) @@ -1824,6 +1825,38 @@ err: return ret; } +ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_get_maccmd; + struct ib_uverbs_get_mac_resp resp; + int ret; + struct ib_pd*pd; + + if (out_len sizeof resp) + return -ENOSPC; + + if 
(copy_from_user(cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file-ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac); + put_pd_read(pd); + if (!ret) { + if (copy_to_user((void __user *) (unsigned long) cmd.response, +resp, sizeof resp)) { + return -EFAULT; + } + return in_len; + } + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..b2f148f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac }; static struct vfsmount *uverbs_event_mnt; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a7da9be..bde5b0d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -94,6 +94,13 @@ rdma_node_get_transport(enum rdma_node_type node_type) } EXPORT_SYMBOL(rdma_node_get_transport); +enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 port_num) +{ + return device-get_port_link_type ? 
+ device-get_port_link_type(device, port_num) : PORT_LINK_IB; +} +EXPORT_SYMBOL(ib_get_port_link_type); + /* Protection domains */ struct ib_pd *ib_alloc_pd(struct ib_device *device) @@ -904,3 +911,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp-device-detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + +int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac) +{ + if (!device-get_mac) + return -ENOSYS; + + return device-get_mac(device, port, gid, mac); +} +EXPORT_SYMBOL(ib_get_mac); + diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..184203b 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -42,7 +42,7 @@ * Increment this value if any changes that break userspace ABI
[ofa-general] [PATCH 2/8 v3] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/agent.c | 12 ++--- drivers/infiniband/core/mad.c | 48 ++- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..c3f2048 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device*device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -175,6 +176,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index de922a0..3d5449f 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -199,6 +199,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, unsigned long flags; u8 mgmt_class, vclass; + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + + if (!port_priv-qp_info[qp_type].qp) 
+ return NULL; + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) @@ -260,13 +270,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Validate device and port */ - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; + if (!mad_agent) + return 0; + /* If the TID is zero, the agent can only snoop. */ if (mad_agent-hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -2602,6 +2608,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2652,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i IB_MAD_QPS_CORE; i++) { qp = port_priv-qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2684,6 +2696,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i IB_MAD_QPS_CORE; i++) { + if (!port_priv-qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(port_priv-qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX Couldn't post receive WRs\n); @@ -2762,6 +2777,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info-qp) + return; + ib_destroy_qp(qp_info-qp); kfree(qp_info-snoop_table); } @@ -2777,6 +2795,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof ib_mad123]; + int has_smi; /* Create new 
device info */ port_priv = kzalloc(sizeof
[ofa-general] [PATCH 3/8 v3] ib_core: Add RDMAoE SA support
Add support for resolving paths and joining multicast group for RDMAoE ports. For Eth links, path resolution will complete immediately but will call the callback from a workqueue context to avoid deadloks. Multicast joins are handled in nearly the same way as IB mulitcast joins are handled in multicast.c. However they are handled entirly at the host and no MADs are involved. This allows for a client to create groups and dictate the qkey to be used in that group. The code is put in rdmaoe_sa.c which handles both multicast joins/leaves and path resolution. The following files were added: drivers/infiniband/core/multicast.h drivers/infiniband/core/rdmaoe_sa.c include/rdma/rdmaoe_sa.h There are changes made in vers/infiniband/core/multicast.c, drivers/infiniband/core/sa_query.c and drivers/infiniband/core/sa.h to allow sharing of data structs. New API functions are added for RDMAoE and comsumenrs who want to use this API need to be changed. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/Makefile|2 +- drivers/infiniband/core/multicast.c | 43 +-- drivers/infiniband/core/multicast.h | 79 +++ drivers/infiniband/core/rdmaoe_sa.c | 938 +++ drivers/infiniband/core/sa.h| 24 + drivers/infiniband/core/sa_query.c | 26 +- include/rdma/rdmaoe_sa.h| 66 +++ 7 files changed, 1113 insertions(+), 65 deletions(-) create mode 100644 drivers/infiniband/core/multicast.h create mode 100644 drivers/infiniband/core/rdmaoe_sa.c create mode 100644 include/rdma/rdmaoe_sa.h diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index cb1ab3e..96db705 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -2,7 +2,7 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS):= rdma_ucm.o obj-$(CONFIG_INFINIBAND) +=ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o $(infiniband-y) + ib_cm.o iw_cm.o rdmaoe_sa.o $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += 
ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=ib_uverbs.o ib_ucm.o \ $(user_access-y) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 107f170..727a55a 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -39,6 +39,7 @@ #include rdma/ib_cache.h #include sa.h +#include multicast.h static void mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device); @@ -72,52 +73,10 @@ struct mcast_device { struct mcast_port port[0]; }; -enum mcast_state { - MCAST_JOINING, - MCAST_MEMBER, - MCAST_ERROR, -}; - -enum mcast_group_state { - MCAST_IDLE, - MCAST_BUSY, - MCAST_GROUP_ERROR, - MCAST_PKEY_EVENT -}; - enum { MCAST_INVALID_PKEY_INDEX = 0x }; -struct mcast_member; - -struct mcast_group { - struct ib_sa_mcmember_rec rec; - struct rb_node node; - struct mcast_port *port; - spinlock_t lock; - struct work_struct work; - struct list_headpending_list; - struct list_headactive_list; - struct mcast_member *last_join; - int members[3]; - atomic_trefcount; - enum mcast_group_state state; - struct ib_sa_query *query; - int query_id; - u16 pkey_index; -}; - -struct mcast_member { - struct ib_sa_multicast multicast; - struct ib_sa_client *client; - struct mcast_group *group; - struct list_headlist; - enum mcast_statestate; - atomic_trefcount; - struct completion comp; -}; - static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, diff --git a/drivers/infiniband/core/multicast.h b/drivers/infiniband/core/multicast.h new file mode 100644 index 000..17eb9fe --- /dev/null +++ b/drivers/infiniband/core/multicast.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions
[ofa-general] [PATCH 4/8 v3] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Modify calls to the SA such that the link type is veirifed first and the appropriate call is made to either IB SA or RDMAoE SA. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/addr.c | 20 --- drivers/infiniband/core/cma.c | 124 +--- include/rdma/ib_addr.h | 53 + 3 files changed, 166 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index ce511d8..440e613 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -64,7 +64,7 @@ struct addr_req { static void process_req(struct work_struct *work); -static DEFINE_MUTEX(lock); +static DEFINE_SPINLOCK(lock); static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; @@ -163,7 +163,7 @@ static void queue_req(struct addr_req *req) { struct addr_req *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_reverse(temp_req, req_list, list) { if (time_after_eq(req-timeout, temp_req-timeout)) break; @@ -173,7 +173,7 @@ static void queue_req(struct addr_req *req) if (req_list.next == req-list) set_timeout(req-timeout); - mutex_unlock(lock); + spin_unlock(lock); } static void addr_send_arp(struct sockaddr *dst_in) @@ -207,7 +207,9 @@ static void addr_send_arp(struct sockaddr *dst_in) if (!dst) return; - neigh_event_send(dst-neighbour, NULL); + if (dst-neighbour) + neigh_event_send(dst-neighbour, NULL); + dst_release(dst); break; } @@ -322,7 +324,7 @@ static void process_req(struct work_struct *work) INIT_LIST_HEAD(done_list); - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-status == -ENODATA) { src_in = (struct sockaddr *) req-src_addr; @@ -341,7 +343,7 @@ static void process_req(struct work_struct *work) req = list_entry(req_list.next, struct addr_req, list); set_timeout(req-timeout); } - mutex_unlock(lock); + spin_unlock(lock); 
list_for_each_entry_safe(req, temp_req, done_list, list) { list_del(req-list); @@ -439,7 +441,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, struct addr_req *req; int ret = 0; - req = kzalloc(sizeof *req, GFP_KERNEL); + req = kzalloc(sizeof *req, GFP_ATOMIC); if (!req) return -ENOMEM; @@ -483,7 +485,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-addr == addr) { req-status = -ECANCELED; @@ -493,7 +495,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) break; } } - mutex_unlock(lock); + spin_unlock(lock); } EXPORT_SYMBOL(rdma_addr_cancel); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..6cf0f1b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -49,6 +49,7 @@ #include rdma/ib_cache.h #include rdma/ib_cm.h #include rdma/ib_sa.h +#include rdma/rdmaoe_sa.h #include rdma/iw_cm.h MODULE_AUTHOR(Sean Hefty); @@ -69,6 +70,8 @@ static struct ib_client cma_client = { }; static struct ib_sa_client sa_client; +static struct ib_sa_client rdmaoe_sa_client; +static struct ib_sa_client rdmaoe_mcast_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); @@ -327,22 +330,30 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, llgid, *tmp; int ret = -ENODEV; switch (rdma_node_get_transport(dev_addr-dev_type)) { case RDMA_TRANSPORT_IB: ib_addr_get_sgid(dev_addr, gid); + tmp = gid; break; case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); + rdma_mac_to_ll_addr(dev_addr-src_dev_addr, llgid); break; default: return -ENODEV; } list_for_each_entry(cma_dev, dev_list, list) { - ret = ib_find_cached_gid(cma_dev-device, gid, + if 
(ib_get_port_link_type(cma_dev-device, id_priv-id.port_num) + == PORT_LINK_ETH
[ofa-general] [PATCH 5/8 v3] ib_core: RDMAoE UD packet packing support
Add support functions to aid in packing RDMAoE packets. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/ud_header.c | 111 +++ include/rdma/ib_pack.h | 26 2 files changed, 137 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 8ec7876..d04b6f2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits= 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits= 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -241,6 +264,53 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_rdmaoe_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @header:Structure to initialize + * + * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, grh.payload_length, + * grh.next_header, bth.opcode, bth.pad_count and + * bth.transport_header_version fields of a struct eth_ud_header given + * the payload length and whether a GRH will be included. 
+ */ +void ib_rdmaoe_ud_header_init(int payload_bytes, + int grh_present, + struct eth_ud_header*header) +{ + int header_len; + + memset(header, 0, sizeof *header); + + header_len = + sizeof header-eth + + IB_BTH_BYTES + + IB_DETH_BYTES; + if (grh_present) + header_len += IB_GRH_BYTES; + + header-grh_present = grh_present; + if (grh_present) { + header-grh.ip_version = 6; + header-grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + +IB_DETH_BYTES+ +payload_bytes+ +4+ /* ICRC */ +3) ~3); /* round up */ + header-grh.next_header = 0x1b; + } + + if (header-immediate_present) + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header-bth.pad_count= (4 - payload_bytes) 3; + header-bth.transport_header_version = 0; +} +EXPORT_SYMBOL(ib_rdmaoe_ud_header_init); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header, EXPORT_SYMBOL(ib_ud_header_pack); /** + * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int rdmaoe_ud_header_pack(struct eth_ud_header *header, + void *buf) +{ + int len = 0; + + ib_pack(eth_table, ARRAY_SIZE(eth_table), + header-eth, buf); + len += IB_ETH_BYTES; + + if (header-grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + header-grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + header-bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + header-deth, buf + len); + len += IB_DETH_BYTES; + + if (header-immediate_present) { + memcpy(buf + len, header-immediate_data, + sizeof header-immediate_data); + len += sizeof header-immediate_data; + } + + return len; +} +EXPORT_SYMBOL(rdmaoe_ud_header_pack); + +/** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/include/rdma/ib_pack.h b/include
[ofa-general] [PATCH 6/8 v3] IB/ipoib: restrict IPoIB to work on IB ports only
We don't want IPoIB to work over RDMAoE since it will give worse performance than working directly on Ethernet interfaces which are a prerequisite to RDMAoE anyway. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/ulp/ipoib/ipoib_main.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ab2c192..f12884c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1228,6 +1228,9 @@ static struct net_device *ipoib_add_port(const char *format, struct ib_port_attr attr; int result = -ENOMEM; + if (ib_get_port_link_type(hca, port) != PORT_LINK_IB) + return ERR_PTR(-ENODEV); + priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 7/8 v3] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or using service from core/addr.c to do that. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/ah.c | 228 ++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 - drivers/infiniband/hw/mlx4/qp.c | 253 +++--- include/linux/mlx4/device.h | 31 - include/linux/mlx4/qp.h |8 +- 5 files changed, 463 insertions(+), 87 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..c994e1f 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,200 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +static struct rdma_addr_client addr_client; + +struct resolve_ctx { + struct completion done; + int status; +}; + + +static int status2err(int status) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + return status; /* TBD */ +} - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); +static void resolve_callback(int status, struct sockaddr *src_addr, +struct rdma_dev_addr *addr, void *context) +{ + struct resolve_ctx *ctx = context; - memset(ah-av, 0, sizeof ah-av); + ctx-status = status; + complete(ctx-done); +} - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - 
--ah-av.stat_rate; +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) +{ + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 dst = {0}; + struct rdma_dev_addr addr; + struct resolve_ctx ctx; + struct net_device *netdev; + int err = 0; + int ifidx; + + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); + + init_completion(ctx.done); + memcpy(dst.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + dst.sin6_family = AF_INET6; + dst.sin6_scope_id = ifidx; + if (rdma_link_local_addr(dst.sin6_addr)) + rdma_get_ll_mac(dst.sin6_addr, mac); + else if (rdma_is_multicast_addr(dst.sin6_addr)) { + rdma_get_mcast_mac(dst.sin6_addr, mac); + *is_mcast = 1; + } else { + err = rdma_resolve_ip(addr_client, NULL, (struct sockaddr *)dst, addr, + 2000, resolve_callback, ctx); + if (!err) + wait_for_completion(ctx.done); + else + ctx.status = err; + + err = status2err(ctx.status); } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + + return err; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); } + ah-av.ib.dlid
[ofa-general] [PATCH 8/8 v3] mlx4: Add RDMAoE support - allow interfaces to correspond to each other
This patch add support RDMAoE for mlx4. Since mlx4_ib now needs to reference mlx4_en netdevices, a new mechanism was added. Two new fields were added to struct mlx4_interface to define a protocol and a get_prot_dev method to retrieve the corresponding protocol's net device. An implementation of the new verb ib_get_port_link_type() - mlx4_ib_get_port_link_type - was added. mlx4_ib_query_port() has been modified to support eth link types. An interface is considered to be active if its corresponding eth interface is active. Code for setting the GID table of a port has been added. Currently, each IB port has a single GID entry in its table and that GID entery equals the link local IPv6 address. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c | 276 + drivers/net/mlx4/cmd.c|6 + drivers/net/mlx4/en_main.c| 15 ++- drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|3 +- drivers/net/mlx4/intf.c | 20 +++ drivers/net/mlx4/main.c |6 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/cmd.h |1 + include/linux/mlx4/driver.h | 16 ++- 10 files changed, 310 insertions(+), 38 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d759..40442b8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -34,6 +34,7 @@ #include linux/module.h #include linux/init.h #include linux/errno.h +#include linux/netdevice.h #include rdma/ib_smi.h #include rdma/ib_user_verbs.h @@ -152,28 +153,19 @@ out: return err; } -static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +static enum ib_port_link_type mlx4_ib_get_port_link_type(struct ib_device *device, +u8 port_num) { - struct ib_smp *in_mad = NULL; - struct ib_smp *out_mad = NULL; - int err = -ENOMEM; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) - goto out; - - memset(props, 0, sizeof *props); - - 
init_query_mad(in_mad); - in_mad-attr_id = IB_SMP_ATTR_PORT_INFO; - in_mad-attr_mod = cpu_to_be32(port); + struct mlx4_dev *dev = to_mdev(device)-dev; - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); - if (err) - goto out; + return dev-caps.port_mask (1 (port_num - 1)) ? + PORT_LINK_IB : PORT_LINK_ETH; +} +static void ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ props-lid = be16_to_cpup((__be16 *) (out_mad-data + 16)); props-lmc = out_mad-data[34] 0x7; props-sm_lid = be16_to_cpup((__be16 *) (out_mad-data + 18)); @@ -193,6 +185,67 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, props-subnet_timeout = out_mad-data[51] 0x1f; props-max_vl_num = out_mad-data[37] 4; props-init_type_reply = out_mad-data[41] 4; + props-link_type= PORT_LINK_IB; +} + +static void eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ + struct mlx4_ib_rdmaoe *rdmaoe = to_mdev(ibdev)-rdmaoe; + struct net_device *ndev; + + props-port_cap_flags = be32_to_cpup((__be32 *) (out_mad-data + 20)); + props-gid_tbl_len = to_mdev(ibdev)-dev-caps.gid_table_len[port]; + props-max_msg_sz = to_mdev(ibdev)-dev-caps.max_msg_sz; + props-pkey_tbl_len = to_mdev(ibdev)-dev-caps.pkey_table_len[port]; + props-bad_pkey_cntr= be16_to_cpup((__be16 *) (out_mad-data + 46)); + props-qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad-data + 48)); + props-active_width = out_mad-data[31] 0xf; + props-active_speed = out_mad-data[35] 4; + props-max_mtu = out_mad-data[41] 0xf; + props-active_mtu = rdmaoe-mtu[port - 1]; + props-subnet_timeout = out_mad-data[51] 0x1f; + props-max_vl_num = out_mad-data[37] 4; + props-init_type_reply = out_mad-data[41] 4; + props-link_type= PORT_LINK_ETH; + spin_lock(rdmaoe-lock); + ndev = rdmaoe-netdevs[port - 1]; + if (!ndev) + goto out; + + props-state= netif_running(ndev) netif_oper_up(ndev) ? 
+ IB_PORT_ACTIVE : IB_PORT_DOWN; + props-phys_state = props-state; +out: + spin_unlock(rdmaoe-lock); +} + +static int
RE: [ofa-general] Write combining support in OFED 1.5
In PPC arch HPAGE_SHIFT is an unexported, variable (because huge page sizes are different there). The huge pages patch needs to reference this value (through HPAGE_SIZE). -Original Message- From: Jack Morgenstein [mailto:ja...@dev.mellanox.co.il] Sent: Tuesday, July 07, 2009 2:14 PM To: Or Gerlitz; Eli Cohen Cc: rdre...@cisco.com; general@lists.openfabrics.org Subject: Re: [ofa-general] Write combining support in OFED 1.5 Adding Eli Cohen (author of the huge-page support patch). Eli, what is missing on the PPC regarding huge page support? -Jack On Tuesday 07 July 2009 13:54, Or Gerlitz wrote: Jack Morgenstein wrote: Yes, see kernel_patches/fixes/mlx4_0010_add_wc.patch. With OFED 1.5, I am trying to use the kernel-based WC support, and eliminate the patch. To do this, I need 2 things: 1. macro pgprot_writecombine to be defined for PPC/PPC64 under the arch directory (file arch/powerpc/include/asm/pgtable.h) 2. A way of determining (via the standard kernel) if write-combining is available on a given platform (to allow disabling use of ConnectX blueflame in userspace if WC is not available). It seems that none of the issues are IB specific, correct? As for PPC guys I think we have another open issue with them, regarding huge pages support, a patch which isn't mainline. Maybe you can use the opportunity to set http://lists.openfabrics.org/pipermail/general/2009-January/056812.htm l as well... Or. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] XRC userspace git trees
Roland, where can I find your userspace trees with XRC support? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH 2/9] ib_core: kernel API for GID -- MAC translations
On Wed, Jun 17, 2009 at 11:19:04AM -0700, Roland Dreier wrote: diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..5bfa2e6 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -81,7 +81,8 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_GET_MAC }; This breaks the ABI defined for XRC. So I guess these patches are not really intended to be applied? I definitely want to finish up XRC before starting to look at this series in detail anyway. Roland, on top of which trees would like me to rebase the patches on and update the ABI version? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH] libibverbs: Add RDMAoE support
On Mon, Jun 15, 2009 at 10:32:03AM -0600, Jason Gunthorpe wrote: On Mon, Jun 15, 2009 at 04:42:36PM +0300, Eli Cohen wrote: Please use inet_pton(AF_INET6,grh,gid) for this? I was searching for something like this... Thanks, I'll fix this. + if (ctx-dgid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx-dgid; + } We've been working to get rid of stuff like this for the IB router work. Can you call into the SA path resolution stuff instead of doing this? Please note this is only an example which initially did not use SA services so I did not change that too. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 3/9] ib_core: RDMAoE support only QP1
On Mon, Jun 15, 2009 at 02:26:42PM -0400, Hal Rosenstock wrote: Should ib_post_send_mad return some error on QP0 sends on RDMAoE ports ? You can't send anything over QP0 because it is not created and so there are no data structs corresponding to it. What QP1 sends are allowed ? Basically, all QP1 sends are allowed without any changes - QP1 functions as normal. However, RDMAoE will initially support only the CM. In the future, we can support additional QP1 services. Is it only SA sends which are faked ? Yes How are others handled ? These questions are important to what happens to the IB management/diag tools when invoked on an RDMAoE port. We need to be able to handle that scenario cleanly. You should be able to send MADs over QP1 from the kernel. I did not make any tests as for sending MADs from userspace but I can't think of a reason why this would be a problem. Is something similar needed in ib_mad_port_close for handling no QP0 on RDMAoE ports in terms of destroy_mad_qp/cleanup_recv_queue calls ? No, because it is handled inside destroy_mad_qp(): if (!qp_info-qp) return; ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ewg] RE: [ofa-general] [PATCH 4/9] ib_core: Add RDMAoE SA support
On Tue, Jun 16, 2009 at 10:27:33AM -0700, Sean Hefty wrote: + +w-member-multicast.rec.qkey = cpu_to_be32(0xc2c); How can a user control this? An app needs the same qkey for unicast traffic. In RDMAoE, the qkey has a fixed well-known value, which will be returned both by multicast and path queries. +atomic_inc(w-member-refcount); This needs to be moved below... I don't follow you - please explain. +static struct ib_sa_multicast * +eth_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ +struct mcast_device *dev; +struct eth_work *w; +struct mcast_member *member; +int err; + +dev = ib_get_client_data(device, mcast_client); +if (!dev) +return ERR_PTR(-ENODEV); + +member = kzalloc(sizeof *member, gfp_mask); +if (!member) +return ERR_PTR(-ENOMEM); + +w = kzalloc(sizeof *w, gfp_mask); +if (!w) { +err = -ENOMEM; +goto out1; +} +w-member = member; +w-device = device; +w-port_num = port_num; + +member-multicast.context = context; +member-multicast.callback = callback; +member-client = client; +member-multicast.rec.mgid = rec-mgid; +init_completion(member-comp); +atomic_set(member-refcount, 1); + +ib_sa_client_get(client); +INIT_WORK(w-work, eth_mcast_work_handler); +queue_work(mcast_wq, w-work); + +return member-multicast; The user could leave/destroy the multicast join request before the queued work item runs. We need to hold an additional reference on the member until the work item completes. Yes, thanks for catching this. I'll fix and resend. There's substantial differences in functionality between an IB multicast group and the Ethernet group. I would rather see the differences hidden by the rdma_cm, than the IB SA module. This is a question of transparency. The motivation is to enable non-cma apps that use the ib_sa_join_multicast() API to work without changes. 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1865049..7bf9b5c 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -45,6 +45,7 @@ There's not a userspace interface into the sa_query module. How will those apps work, or apps that send MADs on QPs other than QP1? Currently, these apps won't work. Sending MADs directly on QPs other than QP1 will not work. However, we expect that a userspace interface to the sa_query module will be implemented in the future, which will forward queries to the kernel module. Naturally, most kernel ULPs and user-level apps will use these standard interfaces instead of implementing MAD queries themselves. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH 0/9] RDMAoE - RDMA over Ethernet
On Tue, Jun 16, 2009 at 11:16:28AM -0500, Steve Wise wrote: Hey Eli, Does this series implement UD/multicast support? I didn't see it with a quick perusal of the patches. Hi Steve, yes UD multicast is supported. Note that in this version of the patches I use the broadcast MAC address (all FF's) for all multicast but this will be changed and GIDs will be mapped to multicast addresses. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 3/9] ib_core: RDMAoE support only QP1
On Wed, Jun 17, 2009 at 07:14:56AM -0400, Hal Rosenstock wrote: On Wed, Jun 17, 2009 at 7:10 AM, Eli Cohene...@dev.mellanox.co.il wrote: On Mon, Jun 15, 2009 at 02:26:42PM -0400, Hal Rosenstock wrote: Should ib_post_send_mad return some error on QP0 sends on RDMAoE ports ? You can't send anything over QP0 because it is not created and so there are no data structs corresponding to it. Yes, I understand that's the intention but I didn't see where a MAD posted to QP0 returns an error. Does that occur ? Or is it just silently dropped ? But you don't have a struct ib_qp * for QP0 that you could use to post MADs to QP0... What QP1 sends are allowed ? Basically, all QP1 sends are allowed without any changes - QP1 functions as normal. However, RDMAoE will initially support only the CM. In the future, we can support additional QP1 services. So what happens with other QP1 sends now ? Do they go into hyperspace and then timeout ? SA joins and SA path queries are terminated at the driver. Otherwise, post sends on QP1 should be sent on the wire. There are a set of tools (infiniband-diags and ibutils) which do send MADs from userspace. I'm concerned that if someone tries these, the right thing will happen. What exactly do you mean? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 0/9] RDMAoE - RDMA over Ethernet
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol over Ethernet, providing IB capabilities for Ethernet fabrics. The packets are standard Ethernet frames with an Ethertype, an IB GRH, unmodified IB transport headers and payload. HCA RDMAoE ports are no different than regular IB ports from the RDMA stack perspective. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. In Ethernet, nodes are commonly referred to by applications by means of an IP address. RDMAoE treats IP addresses that were assigned to the corresponding Ethernet port as GIDs, and makes use of the IP stack to bind a destination address to the corresponding netdevice (just as the CMA does today for IB and iWARP) and to obtain its L2 MAC addresses. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs and the L2 address fields in the API are ignored. The Ethernet L2 information is then obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. In order to maintain application compatibility, RDMAoE implements a SA_Query API that locally returns path records with the corresponding GIDs and the other relevant parameters . Consequently, any CMA or native Verbs application, in kernel or user-space, that uses path queries to obtain its address information, will run transparently over RDMAoE with no changes. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE without *any* changes. In the mlx4 driver stack, mlx4_en must be loaded and the corresponding eth Ethernet (with support for RDMAoE) or IB, as it was already the case. Following is a series of 9 patches based on version 2.6.30 of the Linux kernel. 
Signed-off-by: Eli Cohen e...@mellanox.co.il drivers/infiniband/core/addr.c| 20 ++- drivers/infiniband/core/agent.c | 16 +- drivers/infiniband/core/cma.c | 39 - drivers/infiniband/core/mad.c | 48 -- drivers/infiniband/core/multicast.c | 153 ++-- drivers/infiniband/core/sa_query.c| 167 ++ drivers/infiniband/core/ud_header.c | 111 drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 34 drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 18 ++ drivers/infiniband/hw/mlx4/ah.c | 228 drivers/infiniband/hw/mlx4/main.c | 276 ++--- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 +++- drivers/infiniband/hw/mlx4/qp.c | 253 +- drivers/infiniband/ulp/ipoib/ipoib_main.c |3 + drivers/net/mlx4/cmd.c|6 + drivers/net/mlx4/en_main.c| 15 ++- drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|3 +- drivers/net/mlx4/intf.c | 20 ++ drivers/net/mlx4/main.c |6 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/cmd.h |1 + include/linux/mlx4/device.h | 31 +++- include/linux/mlx4/driver.h | 16 ++- include/linux/mlx4/qp.h |8 +- include/rdma/ib_addr.h| 51 ++ include/rdma/ib_pack.h| 26 +++ include/rdma/ib_user_verbs.h | 21 ++- include/rdma/ib_verbs.h | 22 +++ 31 files changed, 1419 insertions(+), 210 deletions(-) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 1/9] ib_core: Add API to query port link type
This allows to get the type of a port to be either Ethernet or IB which is required by following patches for implementing RDMA over Ethernet - RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/verbs.c |7 +++ include/rdma/ib_verbs.h | 11 +++ 2 files changed, 18 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a7da9be..18631b1 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -94,6 +94,13 @@ rdma_node_get_transport(enum rdma_node_type node_type) } EXPORT_SYMBOL(rdma_node_get_transport); +enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 port_num) +{ + return device-get_port_link_type ? + device-get_port_link_type(device, port_num) : PORT_LINK_IB; +} +EXPORT_SYMBOL(ib_get_port_link_type); + /* Protection domains */ struct ib_pd *ib_alloc_pd(struct ib_device *device) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c179318..8f98c8f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -72,6 +72,11 @@ enum rdma_transport_type { RDMA_TRANSPORT_IWARP }; +enum ib_port_link_type { + PORT_LINK_IB, + PORT_LINK_ETH +}; + enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; @@ -298,6 +303,7 @@ struct ib_port_attr { u8 active_width; u8 active_speed; u8 phys_state; + enum ib_port_link_type link_type; }; enum ib_device_modify_flags { @@ -1003,6 +1009,8 @@ struct ib_device { int(*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); + enum ib_port_link_type (*get_port_link_type)(struct ib_device *device, +u8 port_num); int(*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); @@ -1213,6 +1221,9 @@ int ib_query_device(struct ib_device *device, int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); +enum ib_port_link_type ib_get_port_link_type(struct 
ib_device *device, +u8 port_num); + int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); -- 1.6.3.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 2/9] ib_core: kernel API for GID <-> MAC translations
A few support functions are added to allow the translation from GID to MAC which is required by hw drivers supporting RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 34 + drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 11 ++ include/rdma/ib_user_verbs.h | 21 +-- include/rdma/ib_verbs.h | 11 ++ 6 files changed, 76 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index b3ea958..e69b04c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(get_mac); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..0f1821d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width= attr.active_width; resp.active_speed= attr.active_speed; resp.phys_state = attr.phys_state; + resp.link_type = attr.link_type; if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, sizeof resp)) @@ -1824,6 +1825,39 @@ err: return ret; } +ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_get_maccmd; + struct ib_uverbs_get_mac_resp resp; + int ret; + struct ib_pd*pd; + + if (out_len sizeof resp) + return -ENOSPC; + + if (copy_from_user(cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file-ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac); + put_pd_read(pd); + if (!ret) { + if (copy_to_user((void __user *) (unsigned long) cmd.response, +resp, sizeof 
resp)) { + return -EFAULT; + } + return in_len; + } + + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..b2f148f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac }; static struct vfsmount *uverbs_event_mnt; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 18631b1..e3600c7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -911,3 +911,14 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp-device-detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + + +int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac) +{ + if (!device-get_mac) + return -ENOSYS; + + return device-get_mac(device, port, gid, mac); +} +EXPORT_SYMBOL(ib_get_mac); + diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..5bfa2e6 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -81,7 +81,8 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_GET_MAC }; /* @@ -204,8 +205,9 @@ struct ib_uverbs_query_port_resp { __u8 init_type_reply; __u8 active_width; __u8 active_speed; - __u8 phys_state; - __u8 reserved[3]; + __u8 phys_state; + __u8 link_type; + __u8 reserved[2]; }; struct ib_uverbs_alloc_pd { @@ -621,6 +623,19 @@ struct ib_uverbs_destroy_ah { __u32 ah_handle; }; 
+struct ib_uverbs_get_mac { + __u64 response; + __u32 pd_handle; + __u8 port; + __u8 reserved[3]; + __u8 gid[16]; +}; + +struct ib_uverbs_get_mac_resp { + __u8mac[6
[ofa-general] [PATCH 3/9] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet there is no need for QP0. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/agent.c | 16 - drivers/infiniband/core/mad.c | 48 ++- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..658a278 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -46,8 +46,10 @@ #define SPFX ib_agent: struct ib_agent_port_private { - struct list_head port_list; - struct ib_mad_agent *agent[2]; + struct list_headport_list; + struct ib_mad_agent*agent[2]; + struct ib_device *device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -175,6 +176,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index de922a0..aadf396 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -199,6 +199,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, unsigned long flags; u8 mgmt_class, vclass; + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + + if (!port_priv-qp_info[qp_type].qp) + 
return NULL; + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) @@ -260,13 +270,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Validate device and port */ - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; + if (!mad_agent) + return 0; + /* If the TID is zero, the agent can only snoop. */ if (mad_agent-hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -2602,6 +2608,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2652,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i IB_MAD_QPS_CORE; i++) { qp = port_priv-qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2684,6 +2696,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i IB_MAD_QPS_CORE; i++) { + if (!port_priv-qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(port_priv-qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX Couldn't post receive WRs\n); @@ -2762,6 +2777,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info-qp) + return; + ib_destroy_qp(qp_info-qp); kfree(qp_info-snoop_table); } @@ -2777,6 +2795,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof ib_mad123]; + int has_smi; /* Create new 
device info
[ofa-general] [PATCH 4/9] ib_core: Add RDMAoE SA support
Add support for resolving paths and joining multicast group for RDMAoE ports. The Ethernet specific code will complete immediately but will call the callback from a workqueue context to avoid deadloks. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/multicast.c | 153 drivers/infiniband/core/sa_query.c | 167 +++--- 2 files changed, 269 insertions(+), 51 deletions(-) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 107f170..2417f6b 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -488,6 +488,36 @@ retest: } } +struct eth_work { + struct work_struct work; + struct mcast_member *member; + struct ib_device*device; + u8 port_num; +}; + +static void eth_mcast_work_handler(struct work_struct *work) +{ + struct eth_work *w = container_of(work, struct eth_work, work); + int err; + struct ib_port_attr port_attr; + int status = 0; + + err = ib_query_port(w-device, w-port_num, port_attr); + if (err) + status = err; + else if (port_attr.state != IB_PORT_ACTIVE) + status = -EAGAIN; + + w-member-multicast.rec.qkey = cpu_to_be32(0xc2c); + atomic_inc(w-member-refcount); + err = w-member-multicast.callback(status, w-member-multicast); + deref_member(w-member); + if (err) + ib_sa_free_multicast(w-member-multicast); + + kfree(w); +} + /* * Fail a join request if it is still active - at the head of the pending queue. */ @@ -586,21 +616,14 @@ found: return group; } -/* - * We serialize all join requests to a single group to make our lives much - * easier. Otherwise, two users could try to join the same group - * simultaneously, with different configurations, one could leave while the - * join is in progress, etc., which makes locking around error recovery - * difficult. 
- */ -struct ib_sa_multicast * -ib_sa_join_multicast(struct ib_sa_client *client, -struct ib_device *device, u8 port_num, -struct ib_sa_mcmember_rec *rec, -ib_sa_comp_mask comp_mask, gfp_t gfp_mask, -int (*callback)(int status, -struct ib_sa_multicast *multicast), -void *context) +static struct ib_sa_multicast * +ib_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) { struct mcast_device *dev; struct mcast_member *member; @@ -647,9 +670,81 @@ err: kfree(member); return ERR_PTR(ret); } + +static struct ib_sa_multicast * +eth_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ + struct mcast_device *dev; + struct eth_work *w; + struct mcast_member *member; + int err; + + dev = ib_get_client_data(device, mcast_client); + if (!dev) + return ERR_PTR(-ENODEV); + + member = kzalloc(sizeof *member, gfp_mask); + if (!member) + return ERR_PTR(-ENOMEM); + + w = kzalloc(sizeof *w, gfp_mask); + if (!w) { + err = -ENOMEM; + goto out1; + } + w-member = member; + w-device = device; + w-port_num = port_num; + + member-multicast.context = context; + member-multicast.callback = callback; + member-client = client; + member-multicast.rec.mgid = rec-mgid; + init_completion(member-comp); + atomic_set(member-refcount, 1); + + ib_sa_client_get(client); + INIT_WORK(w-work, eth_mcast_work_handler); + queue_work(mcast_wq, w-work); + + return member-multicast; + +out1: + kfree(member); + return ERR_PTR(err); +} + +/* + * We serialize all join requests to a single group to make our lives much + * easier. 
Otherwise, two users could try to join the same group + * simultaneously, with different configurations, one could leave while the + * join is in progress, etc., which makes locking around error recovery + * difficult. + */ +struct ib_sa_multicast * +ib_sa_join_multicast(struct ib_sa_client *client, +struct ib_device *device, u8 port_num, +struct ib_sa_mcmember_rec *rec
[ofa-general] [PATCH 5/9] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/addr.c | 20 --- drivers/infiniband/core/cma.c | 39 ++ include/rdma/ib_addr.h | 51 3 files changed, 96 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index ce511d8..440e613 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -64,7 +64,7 @@ struct addr_req { static void process_req(struct work_struct *work); -static DEFINE_MUTEX(lock); +static DEFINE_SPINLOCK(lock); static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; @@ -163,7 +163,7 @@ static void queue_req(struct addr_req *req) { struct addr_req *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_reverse(temp_req, req_list, list) { if (time_after_eq(req-timeout, temp_req-timeout)) break; @@ -173,7 +173,7 @@ static void queue_req(struct addr_req *req) if (req_list.next == req-list) set_timeout(req-timeout); - mutex_unlock(lock); + spin_unlock(lock); } static void addr_send_arp(struct sockaddr *dst_in) @@ -207,7 +207,9 @@ static void addr_send_arp(struct sockaddr *dst_in) if (!dst) return; - neigh_event_send(dst-neighbour, NULL); + if (dst-neighbour) + neigh_event_send(dst-neighbour, NULL); + dst_release(dst); break; } @@ -322,7 +324,7 @@ static void process_req(struct work_struct *work) INIT_LIST_HEAD(done_list); - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-status == -ENODATA) { src_in = (struct sockaddr *) req-src_addr; @@ -341,7 +343,7 @@ static void process_req(struct work_struct *work) req = list_entry(req_list.next, struct addr_req, list); set_timeout(req-timeout); } - mutex_unlock(lock); + spin_unlock(lock); list_for_each_entry_safe(req, temp_req, done_list, list) { list_del(req-list); @@ -439,7 +441,7 @@ int rdma_resolve_ip(struct 
rdma_addr_client *client, struct addr_req *req; int ret = 0; - req = kzalloc(sizeof *req, GFP_KERNEL); + req = kzalloc(sizeof *req, GFP_ATOMIC); if (!req) return -ENOMEM; @@ -483,7 +485,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-addr == addr) { req-status = -ECANCELED; @@ -493,7 +495,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) break; } } - mutex_unlock(lock); + spin_unlock(lock); } EXPORT_SYMBOL(rdma_addr_cancel); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..0f15d9d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -327,22 +327,30 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, llgid, *tmp; int ret = -ENODEV; switch (rdma_node_get_transport(dev_addr-dev_type)) { case RDMA_TRANSPORT_IB: ib_addr_get_sgid(dev_addr, gid); + tmp = gid; break; case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); + rdma_mac_to_ll_addr(dev_addr-src_dev_addr, llgid); break; default: return -ENODEV; } list_for_each_entry(cma_dev, dev_list, list) { - ret = ib_find_cached_gid(cma_dev-device, gid, + if (ib_get_port_link_type(cma_dev-device, id_priv-id.port_num) + == PORT_LINK_ETH + rdma_node_get_transport(dev_addr-dev_type) == + RDMA_TRANSPORT_IWARP) + tmp = llgid; + + ret = ib_find_cached_gid(cma_dev-device, tmp, id_priv-id.port_num, NULL); if (!ret) { cma_attach_to_dev(id_priv, cma_dev); @@ -1032,7 +1040,19 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt-num_paths == 2) rt-path_rec[1] = *ib_event-param.req_rcvd.alternate_path; - ib_addr_set_dgid(rt
[ofa-general] [PATCH 6/9] ib_core: RDMAoE UD packet packing support
Add support functions to aid in packing RDMAoE packets. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/ud_header.c | 111 +++ include/rdma/ib_pack.h | 26 2 files changed, 137 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 8ec7876..d04b6f2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits= 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits= 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -241,6 +264,53 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_rdmaoe_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @header:Structure to initialize + * + * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, grh.payload_length, + * grh.next_header, bth.opcode, bth.pad_count and + * bth.transport_header_version fields of a struct eth_ud_header given + * the payload length and whether a GRH will be included. 
+ */ +void ib_rdmaoe_ud_header_init(int payload_bytes, + int grh_present, + struct eth_ud_header*header) +{ + int header_len; + + memset(header, 0, sizeof *header); + + header_len = + sizeof header-eth + + IB_BTH_BYTES + + IB_DETH_BYTES; + if (grh_present) + header_len += IB_GRH_BYTES; + + header-grh_present = grh_present; + if (grh_present) { + header-grh.ip_version = 6; + header-grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + +IB_DETH_BYTES+ +payload_bytes+ +4+ /* ICRC */ +3) ~3); /* round up */ + header-grh.next_header = 0x1b; + } + + if (header-immediate_present) + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header-bth.pad_count= (4 - payload_bytes) 3; + header-bth.transport_header_version = 0; +} +EXPORT_SYMBOL(ib_rdmaoe_ud_header_init); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header, EXPORT_SYMBOL(ib_ud_header_pack); /** + * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int rdmaoe_ud_header_pack(struct eth_ud_header *header, + void *buf) +{ + int len = 0; + + ib_pack(eth_table, ARRAY_SIZE(eth_table), + header-eth, buf); + len += IB_ETH_BYTES; + + if (header-grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + header-grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + header-bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + header-deth, buf + len); + len += IB_DETH_BYTES; + + if (header-immediate_present) { + memcpy(buf + len, header-immediate_data, + sizeof header-immediate_data); + len += sizeof header-immediate_data; + } + + return len; +} +EXPORT_SYMBOL(rdmaoe_ud_header_pack); + +/** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/include/rdma/ib_pack.h b/include
[ofa-general] [PATCH 8/9] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or using service from core/addr.c to do that. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/ah.c | 228 ++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 - drivers/infiniband/hw/mlx4/qp.c | 253 +++--- include/linux/mlx4/device.h | 31 - include/linux/mlx4/qp.h |8 +- 5 files changed, 463 insertions(+), 87 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..c994e1f 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,200 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +static struct rdma_addr_client addr_client; + +struct resolve_ctx { + struct completion done; + int status; +}; + + +static int status2err(int status) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + return status; /* TBD */ +} - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); +static void resolve_callback(int status, struct sockaddr *src_addr, +struct rdma_dev_addr *addr, void *context) +{ + struct resolve_ctx *ctx = context; - memset(ah-av, 0, sizeof ah-av); + ctx-status = status; + complete(ctx-done); +} - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - 
--ah-av.stat_rate; +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) +{ + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 dst = {0}; + struct rdma_dev_addr addr; + struct resolve_ctx ctx; + struct net_device *netdev; + int err = 0; + int ifidx; + + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); + + init_completion(ctx.done); + memcpy(dst.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + dst.sin6_family = AF_INET6; + dst.sin6_scope_id = ifidx; + if (rdma_link_local_addr(dst.sin6_addr)) + rdma_get_ll_mac(dst.sin6_addr, mac); + else if (rdma_is_multicast_addr(dst.sin6_addr)) { + rdma_get_mcast_mac(dst.sin6_addr, mac); + *is_mcast = 1; + } else { + err = rdma_resolve_ip(addr_client, NULL, (struct sockaddr *)dst, addr, + 2000, resolve_callback, ctx); + if (!err) + wait_for_completion(ctx.done); + else + ctx.status = err; + + err = status2err(ctx.status); } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + + return err; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); } + ah-av.ib.dlid
[ofa-general] [PATCH] libibverbs: Add RDMAoE support
Extend the ibv_query_port() verb to return enum ibv_port_link_type which reports the link type to be either IB or Ethernet. This can be used by applications to know if they must use GRH as is the case in RDMAoE. Add a new system call to get the MAC address of the remote port that a UD address vector refers to. Update ibv_rc_pingpong and ibv_ud_pingpong to accept a remote GID so that they can be used with an RDMAoE port. Signed-off-by: Eli Cohen e...@mellanox.co.il --- examples/devinfo.c| 13 ++ examples/pingpong.c | 31 + examples/pingpong.h |3 ++ examples/rc_pingpong.c| 50 examples/ud_pingpong.c| 37 +++--- include/infiniband/driver.h |1 + include/infiniband/kern-abi.h | 23 +- include/infiniband/verbs.h|7 + src/cmd.c | 20 src/libibverbs.map|1 + 10 files changed, 170 insertions(+), 16 deletions(-) diff --git a/examples/devinfo.c b/examples/devinfo.c index caa5d5f..fc9dbb9 100644 --- a/examples/devinfo.c +++ b/examples/devinfo.c @@ -175,6 +175,18 @@ static int print_all_port_gids(struct ibv_context *ctx, uint8_t port_num, int tb return rc; } +static const char *link_type_str(enum ibv_port_link_type type) +{ + switch (type) { + case PORT_LINK_IB: + return PORT_LINK_IB; + case PORT_LINK_ETH: + return PORT_LINK_ETH; + default: + return Unknown; + } +} + static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) { struct ibv_context *ctx; @@ -273,6 +285,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid); printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid); printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc); + printf(\t\t\tlink_type:\t\t%s\n, link_type_str(port_attr.link_type)); if (verbose) { printf(\t\t\tmax_msg_sz:\t\t0x%x\n, port_attr.max_msg_sz); diff --git a/examples/pingpong.c b/examples/pingpong.c index b916f59..e53e2fa 100644 --- a/examples/pingpong.c +++ b/examples/pingpong.c @@ -31,6 +31,8 @@ */ #include pingpong.h +#include arpa/inet.h +#include stdlib.h enum ibv_mtu pp_mtu_to_enum(int 
mtu) { @@ -53,3 +55,32 @@ uint16_t pp_get_local_lid(struct ibv_context *context, int port) return attr.lid; } + +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr) +{ + return ibv_query_port(context, port, attr); +} + +void str2gid(char *grh, union ibv_gid *gid) +{ + char tmp; + + tmp = grh[8]; + grh[8] = 0; + gid-dwords[0] = htonl(strtoul(grh, NULL, 16)); + grh[8] = tmp; + + tmp = grh[16]; + grh[16] = 0; + gid-dwords[1] = htonl(strtoul(grh + 8, NULL, 16)); + grh[16] = tmp; + + tmp = grh[24]; + grh[24] = 0; + gid-dwords[2] = htonl(strtoul(grh + 16, NULL, 16)); + grh[24] = tmp; + + gid-dwords[3] = htonl(strtoul(grh + 24, NULL, 16)); +} + diff --git a/examples/pingpong.h b/examples/pingpong.h index 71d7c3f..8c82b32 100644 --- a/examples/pingpong.h +++ b/examples/pingpong.h @@ -37,5 +37,8 @@ enum ibv_mtu pp_mtu_to_enum(int mtu); uint16_t pp_get_local_lid(struct ibv_context *context, int port); +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr); +void str2gid(char *grh, union ibv_gid *gid); #endif /* IBV_PINGPONG_H */ diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c index 26fa45c..23cad7a 100644 --- a/examples/rc_pingpong.c +++ b/examples/rc_pingpong.c @@ -67,6 +67,8 @@ struct pingpong_context { int size; int rx_depth; int pending; + struct ibv_port_attr portinfo; + union ibv_giddgid; }; struct pingpong_dest { @@ -94,6 +96,12 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, .port_num = port } }; + + if (ctx-dgid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx-dgid; + } if (ibv_modify_qp(ctx-qp, attr, IBV_QP_STATE | IBV_QP_AV | @@ -289,11 +297,11 @@ out: static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, - int use_event) + int use_event, int is_server
[ofa-general] [PATCH] libmlx4: Add RDMAoE support
Modify mlx4_create_ah() to check the port link type and for Ethernet ports, do a system call to retrieve the remote port's MAC address. Add ConnectX EN 10GigE PCIe gen2 to the list of supported device. Make modifications to address vector data structs and code to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- src/mlx4.c |1 + src/mlx4.h |3 +++ src/qp.c|2 ++ src/verbs.c | 14 ++ src/wqe.h |3 ++- 5 files changed, 22 insertions(+), 1 deletions(-) diff --git a/src/mlx4.c b/src/mlx4.c index 34ece39..d2e32fd 100644 --- a/src/mlx4.c +++ b/src/mlx4.c @@ -66,6 +66,7 @@ struct { HCA(MELLANOX, 0x6354), /* MT25408 Hermon QDR */ HCA(MELLANOX, 0x6732), /* MT25408 Hermon DDR PCIe gen2 */ HCA(MELLANOX, 0x673c), /* MT25408 Hermon QDR PCIe gen2 */ + HCA(MELLANOX, 0x6750), /* MT25408 Hermon EN 10GigE PCIe gen2 */ }; static struct ibv_context_ops mlx4_ctx_ops = { diff --git a/src/mlx4.h b/src/mlx4.h index 827a201..20d3fdd 100644 --- a/src/mlx4.h +++ b/src/mlx4.h @@ -236,11 +236,14 @@ struct mlx4_av { uint8_t hop_limit; uint32_tsl_tclass_flowlabel; uint8_t dgid[16]; + uint8_t mac[8]; }; struct mlx4_ah { struct ibv_ah ibv_ah; struct mlx4_av av; + uint16_tvlan; + uint8_t mac[6]; }; static inline unsigned long align(unsigned long val, unsigned long align) diff --git a/src/qp.c b/src/qp.c index d194ae3..cd8fab0 100644 --- a/src/qp.c +++ b/src/qp.c @@ -143,6 +143,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, memcpy(dseg-av, to_mah(wr-wr.ud.ah)-av, sizeof (struct mlx4_av)); dseg-dqpn = htonl(wr-wr.ud.remote_qpn); dseg-qkey = htonl(wr-wr.ud.remote_qkey); + dseg-vlan = htons(to_mah(wr-wr.ud.ah)-vlan); + memcpy(dseg-mac, to_mah(wr-wr.ud.ah)-mac, 6); } static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) diff --git a/src/verbs.c b/src/verbs.c index cc179a0..ff59a93 100644 --- a/src/verbs.c +++ b/src/verbs.c @@ -617,6 +617,7 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp) struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct 
ibv_ah_attr *attr) { struct mlx4_ah *ah; + struct ibv_port_attr port_attr; ah = malloc(sizeof *ah); if (!ah) @@ -642,7 +643,20 @@ struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) memcpy(ah-av.dgid, attr-grh.dgid.raw, 16); } + if (ibv_query_port(pd-context, attr-port_num, port_attr)) + goto err; + + if (port_attr.link_type == PORT_LINK_ETH) { + if (ibv_cmd_get_mac(pd, attr-port_num, ah-av.dgid, ah-mac)) + goto err; + ah-vlan = 0; + } + + return ah-ibv_ah; +err: + free(ah); + return NULL; } int mlx4_destroy_ah(struct ibv_ah *ah) diff --git a/src/wqe.h b/src/wqe.h index 6f7f309..ea6f27f 100644 --- a/src/wqe.h +++ b/src/wqe.h @@ -78,7 +78,8 @@ struct mlx4_wqe_datagram_seg { uint32_tav[8]; uint32_tdqpn; uint32_tqkey; - uint32_treserved[2]; + __be16 vlan; + uint8_t mac[6]; }; struct mlx4_wqe_data_seg { -- 1.6.3.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Ipoib child interfaces are created in dgram mode, even if the config file (and main interface) specify connected mode.
On Mon, Jun 01, 2009 at 09:01:23AM -0500, Mike Heinz wrote: One of our testers recently reported that he was getting different performance numbers from IPOIB depending on whether he was using interface ib0 or a child interface he had created with /sys/class/netib0/create_child. Investigating, I noticed that the two interfaces have different MTU sizes: ib0 Link encap:UNSPEC HWaddr 80-00-00-02-FE-80-00-00-00-00-00-00-00-00-00-00 inet addr:172.21.33.58 Bcast:172.21.255.255 Mask:255.255.0.0 inet6 addr: fe80::211:7500:ff:8fa4/64 Scope:Link UP BROADCAST RUNNING MULTICAST MTU:65520 Metric:1 RX packets:76 errors:0 dropped:0 overruns:0 frame:0 TX packets:23147668 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:256 RX bytes:5284 (5.1 Kb) TX bytes:17365044664 (16560.5 Mb) ib0.9001 Link encap:UNSPEC HWaddr 80-00-00-06-FE-80-00-00-00-00-00-00-00-00-00-00 inet addr:172.21.34.58 Bcast:172.21.255.255 Mask:255.255.0.0 inet6 addr: fe80::211:7500:ff:8fa4/64 Scope:Link UP BROADCAST RUNNING MULTICAST MTU:2044 Metric:1 RX packets:60413619 errors:0 dropped:0 overruns:0 frame:0 TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:256 RX bytes:112572449188 (107357.4 Mb) TX bytes:0 (0.0 b) This leads me to believe that the child interface is in datagram mode even though the parent is in connected mode. Probably it is, but you can always check the mode by reading the corresponding sysfs file: /sys/class/net/ib0.9001/mode Is this expected behavior? Yes, it is the expected behaviour. Is there a way to change this? 
Yes, it is possible -- Michael Heinz Principal Engineer, Qlogic Corporation King of Prussia, Pennsylvania ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [RFC] ipoib: avoid using stale ipoib_neigh* in ipoib_neigh_cleanup()
On Sun, May 31, 2009 at 09:41:54AM +0300, Or Gerlitz wrote: akep...@sgi.com wrote @ http://lists.openfabrics.org/pipermail/general/2009-May/059730.html What would prevent a race between a tx completion (with an error) and the cleanup of a neighbour? Okay, so maybe this code/design of using the stashed ipoib_neighbour at the tx completion code is the root cause of all these troubles?! From a quick look on the code and two patches that touched this area (f56bcd801... Use separate CQ for UD send completions and 57ce41d1... Fix transmit queue stalling forever) - I see that the original tx cq handler - ipoib_ib_handle_tx_wc() doesn't touch the neigbour but today is called only from the drain timer dev-stop flows. Now, ipoib_cm_handle_tx_wc() is called for normal flow both for datagram and connected modes, and this function touches he neighbour. Or, I don't follow on you - ipoib_cm_handle_tx_wc() called ipoib_neigh_free() from the first commit. Also please note the following designation of CQs: recv_cq: used for all receives and for CM send send_cq: used for UD send Thus, since in ipoib_poll() we poll recv_cq, any none receive must be that of CM mode sends. I am not sure why commit f56bcd801... made UD completions to go through ipoib_cm_handle_tx_wc() nor why this function must use the neighbor to access the data-structure it needs to, maybe Eli can comment on that? Or. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [PATCH 2/3] libmlx4 - Optimize memory allocation of QP buffers with 64K pages
On Wed, May 20, 2009 at 08:00:47AM +0200, sebastien dugue wrote: Well not really, because if we stay below MMAP_THRESHOLD, as we do with 4K pages, the only overhead is malloc's chaining structure. The extra space used to align the buffer is released before posix_memalign() returns, but that does increase fragmentation of mallocs chunks. Also, for 4K pages, mmap() systematically results in a syscall whereas posix_memalign() does not necessarily, but as we're not on a fast path I'm not sure what would be best. I don't mind converting all QP buffers allocation to mmap(), but I'd like to hear what people think. If the only reasoning behind using a MMAP_THRESHOLD is to avoid the system call for smaller allocations, then I think we'd better use a uniform allocation scheme -- mmap -- as you proposed and not distinguish between the two cases. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 1/2] mlx4_core: Use module parameter for number of MTTs per segment
The current MTTs allocator uses kmalloc to allocate a buffer for it's buddy system implementation and thus is limited by the amount of MTT segments that it can control. As a result, the size of memory that can be registered is limited too. This patch uses a module parameter to control the number of MTT entries that each segment represents, thus allowing to register more memory with the same number of segments. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/net/mlx4/main.c | 14 -- drivers/net/mlx4/mr.c |6 +++--- drivers/net/mlx4/profile.c |2 +- include/linux/mlx4/device.h |1 + 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 30bea96..018348c 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -100,6 +100,10 @@ module_param_named(use_prio, use_prio, bool, 0444); MODULE_PARM_DESC(use_prio, Enable steering by VLAN priority on ETH ports (0/1, default 0)); +static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, Log2 number of MTT entries per segment (1-5)); + int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -203,12 +207,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev-caps.max_cqes = dev_cap-max_cq_sz - 1; dev-caps.reserved_cqs = dev_cap-reserved_cqs; dev-caps.reserved_eqs = dev_cap-reserved_eqs; + dev-caps.mtts_per_seg = 1 log_mtts_per_seg; dev-caps.reserved_mtts = DIV_ROUND_UP(dev_cap-reserved_mtts, - MLX4_MTT_ENTRY_PER_SEG); + dev-caps.mtts_per_seg); dev-caps.reserved_mrws = dev_cap-reserved_mrws; dev-caps.reserved_uars = dev_cap-reserved_uars; dev-caps.reserved_pds = dev_cap-reserved_pds; - dev-caps.mtt_entry_sz = MLX4_MTT_ENTRY_PER_SEG * dev_cap-mtt_entry_sz; + dev-caps.mtt_entry_sz = dev-caps.mtts_per_seg * dev_cap-mtt_entry_sz; dev-caps.max_msg_sz = dev_cap-max_msg_sz; 
dev-caps.page_size_cap = ~(u32) (dev_cap-min_page_sz - 1); dev-caps.flags = dev_cap-flags; @@ -1304,6 +1309,11 @@ static int __init mlx4_verify_params(void) return -1; } + if ((log_mtts_per_seg 1) || (log_mtts_per_seg 5)) { + printk(KERN_WARNING mlx4_core: bad log_mtts_per_seg: %d\n, log_mtts_per_seg); + return -1; + } + return 0; } diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 0caf74c..3b8973d 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -209,7 +209,7 @@ int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, } else mtt-page_shift = page_shift; - for (mtt-order = 0, i = MLX4_MTT_ENTRY_PER_SEG; i npages; i = 1) + for (mtt-order = 0, i = dev-caps.mtts_per_seg; i npages; i = 1) ++mtt-order; mtt-first_seg = mlx4_alloc_mtt_range(dev, mtt-order); @@ -350,7 +350,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mpt_entry-pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | MLX4_MPT_PD_FLAG_RAE); mpt_entry-mtt_sz= cpu_to_be32((1 mr-mtt.order) * - MLX4_MTT_ENTRY_PER_SEG); + dev-caps.mtts_per_seg); } else { mpt_entry-flags|= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); } @@ -391,7 +391,7 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64))) return -EINVAL; - if (start_index (MLX4_MTT_ENTRY_PER_SEG - 1)) + if (start_index (dev-caps.mtts_per_seg - 1)) return -EINVAL; mtts = mlx4_table_find(priv-mr_table.mtt_table, mtt-first_seg + diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index cebdf32..bd22df9 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -98,7 +98,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_EQ].size = dev_cap-eqc_entry_sz; profile[MLX4_RES_DMPT].size = dev_cap-dmpt_entry_sz; profile[MLX4_RES_CMPT].size = dev_cap-cmpt_entry_sz; - profile[MLX4_RES_MTT].size= MLX4_MTT_ENTRY_PER_SEG * dev_cap-mtt_entry_sz; + profile[MLX4_RES_MTT].size= dev-caps.mtts_per_seg * 
dev_cap-mtt_entry_sz; profile[MLX4_RES_MCG].size= MLX4_MGM_ENTRY_SIZE; profile[MLX4_RES_QP].num = request-num_qp
[ofa-general] [PATCH 2/2] ib_mthca: Use module parameter for number of MTTs per segment
The current MTTs allocator uses kmalloc to allocate a buffer for it's buddy system implementation and thus is limited by the amount of MTT segments that it can control. As a result, the size of memory that can be registered is limited too. This patch uses a module parameter to control the number of MTT entries that each segment represents, thus allowing to register more memory with the same number of segments. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mthca/mthca_cmd.c |2 +- drivers/infiniband/hw/mthca/mthca_dev.h |1 + drivers/infiniband/hw/mthca/mthca_main.c| 17 ++--- drivers/infiniband/hw/mthca/mthca_mr.c | 16 drivers/infiniband/hw/mthca/mthca_profile.c |4 ++-- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 6d55f9d..8c2ed99 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1059,7 +1059,7 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); if (mthca_is_memfree(dev)) dev_lim-reserved_mtts = ALIGN((1 (field 4)) * sizeof(u64), - MTHCA_MTT_SEG_SIZE) / MTHCA_MTT_SEG_SIZE; + dev-limits.mtt_seg_size) / dev-limits.mtt_seg_size; else dev_lim-reserved_mtts = 1 (field 4); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 2525901..9ef611f 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -159,6 +159,7 @@ struct mthca_limits { int reserved_eqs; int num_mpts; int num_mtt_segs; + int mtt_seg_size; int fmr_reserved_mtts; int reserved_mtts; int reserved_mrws; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 1d83cf7..13da9f1 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ 
-125,6 +125,10 @@ module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); MODULE_PARM_DESC(fmr_reserved_mtts, number of memory translation table segments reserved for FMR); +static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, Log2 number of MTT entries per segment (1-5)); + static char mthca_version[] __devinitdata = DRV_NAME : Mellanox InfiniBand HCA driver v DRV_VERSION ( DRV_RELDATE )\n; @@ -162,6 +166,7 @@ static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) int err; u8 status; + mdev-limits.mtt_seg_size = (1 log_mtts_per_seg) * 8; err = mthca_QUERY_DEV_LIM(mdev, dev_lim, status); if (err) { mthca_err(mdev, QUERY_DEV_LIM command failed, aborting.\n); @@ -460,11 +465,11 @@ static int mthca_init_icm(struct mthca_dev *mdev, } /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ - mdev-limits.reserved_mtts = ALIGN(mdev-limits.reserved_mtts * MTHCA_MTT_SEG_SIZE, - dma_get_cache_alignment()) / MTHCA_MTT_SEG_SIZE; + mdev-limits.reserved_mtts = ALIGN(mdev-limits.reserved_mtts * mdev-limits.mtt_seg_size, + dma_get_cache_alignment()) / mdev-limits.mtt_seg_size; mdev-mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca-mtt_base, -MTHCA_MTT_SEG_SIZE, + mdev-limits.mtt_seg_size, mdev-limits.num_mtt_segs, mdev-limits.reserved_mtts, 1, 0); @@ -1315,6 +1320,12 @@ static void __init mthca_validate_profile(void) printk(KERN_WARNING PFX Corrected fmr_reserved_mtts to %d.\n, hca_profile.fmr_reserved_mtts); } + + if ((log_mtts_per_seg 1) || (log_mtts_per_seg 5)) { + printk(KERN_WARNING PFX bad log_mtts_per_seg (%d). 
Using default - %d\n, + log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); + log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); + } } static int __init mthca_init(void) diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 882e6b7..d606edf 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c
Re: [ofa-general] OFED1.3.1: soft lockup in completion handler
On Tue, Apr 21, 2009 at 11:08:25AM +0800, Liang Zhen wrote: In our module, we have per-CPU thread and per-CPU waitq, each thread has it's own connection list to poll on, completion handler will dispatch connection to it's scheduler and wake the scheduler. I'm very sure that scheduler doesn't have any heavy slow operation with holding ibs_lock, and I never got this problem on system with = 8 cores. Looks like completion handler is always run on the same core, so completion handler race with all other cores on their own waitq and very likely to get soft lockup... Where is the other place that you acquire conn-ibc_sched-ibs_lock? Is it in the per CPU thread? Maybe you should try to decrease the time when the lock is acquired at the thread. Can you send all references to the code aquiring the lock? I've tried to turn off irqbalancer and set /proc/irq/.../smp_affinity for more cores, but changed nothing and still soft lockup. After I installed ofed1.4.1 and create CQ with ib_create_cq(comp_vector), the problem is gone and get really good performance. The problem now is, seems ofed1.4.1:mlx4 is the only driver can really support multiple completion vectors, but we can't expect all customers to have the same environment... Is there only other possible way to resolve this? Thanks Liang ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Mon, Apr 13, 2009 at 10:19:08PM +0300, Yossi Etigin wrote: I see. That surprises me... could you also print vma-vm_start and vma-vm_end? I would have expected the vma to be streched to fill a full huge page and return the number of regular pages fitting in it. Eli Cohen wrote: On Mon, Apr 06, 2009 at 08:49:43PM +0300, Yossi Etigin wrote: I don't understand - if all area is huge pages, it does not mean that it fills full huge pages - I can have just 4096 bytes in huge page memory and umem-hugetlb will remain 1, right? You may call ib_umem_get() with a fraction of a huge page but I expect the number of pages returned from get_user_pages() will fill up a huge page. Can you check that with the mckey test you were using? The number of pages is 1. I got this in dmesg with the modified mckey (see the last line): umem: addr=508000 size=1024 hugetlb=0 npages=1 umem: addr=50a000 size=4096 hugetlb=0 npages=1 umem: addr=50c000 size=4352 hugetlb=0 npages=2 umem: addr=50f000 size=4096 hugetlb=0 npages=1 umem: addr=2ac0 size=140 hugetlb=1 npages=1 After applying this to umem.c: --- ofa_kernel-1.4.1/drivers/infiniband/core/umem.c 2009-04-13 22:15:19.0 +0300 +++ ofa_kernel-1.4.1.patched/drivers/infiniband/core/umem.c 2009-04-13 22:09:36.0 +0300 @@ -137,6 +137,7 @@ int ret; int off; int i; + int ntotalpages; DEFINE_DMA_ATTRS(attrs); if (dmasync) @@ -196,6 +197,7 @@ cur_base = addr PAGE_MASK; ret = 0; + ntotalpages = 0; while (npages) { ret = get_user_pages(current, current-mm, cur_base, min_t(unsigned long, npages, @@ -226,6 +228,7 @@ !is_vm_hugetlb_page(vma_list[i + off])) umem-hugetlb = 0; sg_set_page(chunk-page_list[i], page_list[i + off], PAGE_SIZE, 0); + ntotalpages++; } chunk-nmap = ib_dma_map_sg_attrs(context-device, @@ -254,8 +257,11 @@ if (ret 0) { __ib_umem_release(context-device, umem, 0); kfree(umem); - } else + } else { current-mm-locked_vm = locked; + printk(KERN_DEBUG umem: addr=%lx size=%ld hugetlb=%d npages=%d\n, +addr, size, umem-hugetlb, ntotalpages); + } 
up_write(current-mm-mmap_sem); if (vma_list) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Mon, Apr 06, 2009 at 08:49:43PM +0300, Yossi Etigin wrote: I don't understand - if all area is huge pages, it does not mean that it fills full huge pages - I can have just 4096 bytes in huge page memory and umem-hugetlb will remain 1, right? You may call ib_umem_get() with a fraction of a huge page but I expect the number of pages returned from get_user_pages() will fill up a huge page. Can you check that with the mckey test you were using? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Thu, Apr 02, 2009 at 11:31:42PM +0300, Yossi Etigin wrote: Hi Eli, I've placed a printk in the new hugetlb function to print n and j. While running the mckey test (attached in bugzilla), I got j=0, n=1. Why do you say that the number of pages must cover HUGE_PAGES? In ib_umem_get, hugetlb is set to 0 only if any of the pages is not-hugetlb - otherwise it's 1. Am I missing something? Only if ALL of the registered area is huge pages, then umem-hugetlb will remain 1; otherwise it is cleared and we don't execute handle_hugetlb_user_mr(). In case we do execute handle_hugetlb_user_mr(), I expect the number of PAGE_SIZEed pages to fill full huge pages. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Mon, Mar 30, 2009 at 06:49:37PM +0300, Yossi Etigin wrote: Yossi, I wouldn't expect this to matter since handle_hugetlb_user_mr() only gets called when the memory is huge pages which means the number of PAGE_SIZE pages cover full HUGE_PAGES. You mention in Bugzilla an mckey test but I don't know this test. Can you send how to obtain the test and instructions how to build it and run it? + + if (cur_size) { + arr[j++] = cur_addr; + } + http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Thu, Apr 02, 2009 at 11:47:26AM +0300, Or Gerlitz wrote: mckey is installed with librdmac-utils, has man page, etc. Its source is under the examples directory of the librdmacm src tree, you can clone Sean's librdmacm git from ofa to get it - OK? Thanks Or. I will check this. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [ewg] Re: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation
On Sun, Mar 29, 2009 at 12:07:09PM +0300, Moni Shoua wrote: Eli Cohen wrote: On Wed, Mar 25, 2009 at 09:20:19PM +0200, Yossi Etigin wrote: Wouldn't this patch break the fast bond+SM failover recovery? Up till now, in such case neigh-dgid was the old gid until path record query was successful, and that caused to retry the path query until it's successful. With that patch, a new path query will not be issued until the next arp refresh because neigh-dgid will be set to a valid value right after the first packet. Looks to me like How about keeping the memcpy in path_rec-completion, and also adding it to ib_mcast_send (where ipoib neigh is created)? I did not like the idea that a unicast ipoib_neigh can be created and destroyed till the path is resolved so I preferred to initialize at creation time. How about the following patch to solve the problem of failure to resolve a path: From 4e6e3dd0186803f7547f5e4c233d7525dfcdaec6 Mon Sep 17 00:00:00 2001 From: Eli Cohen e...@mellanox.co.il Date: Wed, 25 Mar 2009 16:22:28 +0200 Subject: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation If we fail to do that, there can be superfluous calls to ipoib_neigh_free()/ipoib_neigh_alloc() due to the following if statement in ipoib_start_xmit(): if (unlikely((memcmp(neigh-dgid.raw, skb-dst-neighbour-ha + 4, sizeof(union ib_gid))) || In the case of a unicast GID, it can happen until the path is resolved and is not so severe. In the case of a multicast address, neigh-dgid is never even updated, so the corresponding ipoib_neigh will be destroyed and created on every multicast packet sent. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/ulp/ipoib/ipoib_main.c |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0bd2a4f..b680483 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -460,8 +460,6 @@ static void path_rec_completion(int status, } kref_get(path-ah-ref); neigh-ah = path-ah; - memcpy(neigh-dgid.raw, path-pathrec.dgid.raw, - sizeof(union ib_gid)); if (ipoib_cm_enabled(dev, neigh-neighbour)) { if (!ipoib_cm_get(neigh)) @@ -481,7 +479,9 @@ static void path_rec_completion(int status, __skb_queue_tail(skqueue, skb); } path-valid = 1; - } + } else + list_for_each_entry_safe(neigh, tn, path-neigh_list, list) + memset(neigh-dgid, 0, sizeof neigh-dgid); path-query = NULL; complete(path-done); @@ -878,6 +878,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh-neighbour = neighbour; neigh-dev = dev; *to_ipoib_neigh(neighbour) = neigh; + memcpy(neigh-dgid.raw, neighbour-ha + 4, 16); skb_queue_head_init(neigh-queue); ipoib_cm_set(neigh, NULL); This might work but still, I don't understand what this patch fixes. What was the initial problem? The problem was that for multicast sends, the corresponding ipoib_neigh would be created and destroyed for every packet because this if will fail: if (unlikely((memcmp(neigh-dgid.raw, skb-dst-neighbour-ha + 4, sizeof(union ib_gid))) || There is a less severe problem with unicast flow where the ipoib_neigh object might be created and destroyed until the path is resolved. I agree with Yossi's early comments that the patch breaks the fast fail over we've worked on in the past. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation
On Wed, Mar 25, 2009 at 09:20:19PM +0200, Yossi Etigin wrote: Wouldn't this patch break the fast bond+SM failover recovery? Up till now, in such case neigh-dgid was the old gid until path record query was successful, and that caused to retry the path query until it's successful. With that patch, a new path query will not be issued until the next arp refresh because neigh-dgid will be set to a valid value right after the first packet. Looks to me like How about keeping the memcpy in path_rec-completion, and also adding it to ib_mcast_send (where ipoib neigh is created)? I did not like the idea that a unicast ipoib_neigh can be created and destroyed till the path is resolved so I preferred to initialize at creation time. How about the following patch to solve the problem of failure to resolve a path: From 4e6e3dd0186803f7547f5e4c233d7525dfcdaec6 Mon Sep 17 00:00:00 2001 From: Eli Cohen e...@mellanox.co.il Date: Wed, 25 Mar 2009 16:22:28 +0200 Subject: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation If we fail to do that, there can be superfluous calls to ipoib_neigh_free()/ipoib_neigh_alloc() due to the following if statement in ipoib_start_xmit(): if (unlikely((memcmp(neigh-dgid.raw, skb-dst-neighbour-ha + 4, sizeof(union ib_gid))) || In the case of a unicast GID, it can happen until the path is resolved and is not so severe. In the case of a multicast address, neigh-dgid is never even updated, so the corresponding ipoib_neigh will be destroyed and created on every multicast packet sent. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/ulp/ipoib/ipoib_main.c |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0bd2a4f..b680483 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -460,8 +460,6 @@ static void path_rec_completion(int status, } kref_get(path-ah-ref); neigh-ah = path-ah; - memcpy(neigh-dgid.raw, path-pathrec.dgid.raw, - sizeof(union ib_gid)); if (ipoib_cm_enabled(dev, neigh-neighbour)) { if (!ipoib_cm_get(neigh)) @@ -481,7 +479,9 @@ static void path_rec_completion(int status, __skb_queue_tail(skqueue, skb); } path-valid = 1; - } + } else + list_for_each_entry_safe(neigh, tn, path-neigh_list, list) + memset(neigh-dgid, 0, sizeof neigh-dgid); path-query = NULL; complete(path-done); @@ -878,6 +878,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh-neighbour = neighbour; neigh-dev = dev; *to_ipoib_neigh(neighbour) = neigh; + memcpy(neigh-dgid.raw, neighbour-ha + 4, 16); skb_queue_head_init(neigh-queue); ipoib_cm_set(neigh, NULL); -- 1.6.2.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] resolving ipv6 addresses - help needed
Hi, I am trying to resolve IPv6 addresses (that is, obtaining the MAC address of the interface bearing the IP addresses). I am using rdma_resolve_ip() defined in core/addr.c but I don't get consistent results. I get something like this: fe80::202:c9ff:fe00:1341 dev eth1 FAILED where I would expect the address to be reachable through eth2. If I run: ping6 -I eth2 fe80::202:c9ff:fe00:1341 then I'd get the address resolved but after some time, when I attempt to resolve again, the operation fails. Is this API tested and reliable? Maybe I'm doing something wrong... Thanks, Eli ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] resolving ipv6 addresses - help needed
On Thu, Mar 05, 2009 at 09:29:48AM -0800, Sean Hefty wrote: I didn't quite follow this last sentence. Are you saying that it only works sometimes? Only the first time? After I run ping6 from the command line, the remote ipv6 address gets added to the cache and resolution starts working. After some time (about 2 sec or so), I try to resolve the same address again but the operation fails. What kernel version are you running? 2.6.29-rc6 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Too many calls to mlx4_CLOSE_PORT()?
Roland, browsing the code, I see that mlx4_CLOSE_PORT() gets called from, seemingly, too many places. I would expect it to get called only from __mlx4_ib_modify_qp() when QP0 gets closed, but mlx4_ib_remove() calls it too even though it is soon to be called by __mlx4_ib_modify_qp() due to destroying the MAD QP. It also gets called from mlx4_remove_one() even though by the time this function gets called, the port is already closed. Is there a reason for that? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] iscsi initiator ipoib+lro crash on upstream kernel
Thanks! On Thu, Feb 19, 2009 at 9:40 PM, Or Gerlitz or.gerl...@gmail.com wrote: On Thu, Feb 19, 2009 at 6:55 PM, Eli Cohen e...@dev.mellanox.co.il wrote: I have encountered a kernel crash when running a iSCSI initiator on IPoIB configured with LRO (if LRO is off it does not happen). This was seen first on Sles10sp2 but then I verified it happens on 2.6.28.2 too. Eli, This is a known issue (http://bugzilla.kernel.org/show_bug.cgi?id=11804) a fix was submitted upstream and would be included in the next kernel. Or. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] iscsi initiator ipoib+lro crash on upstream kernel
Hi, I have encountered a kernel crash when running a iSCSI initiator on IPoIB configured with LRO (if LRO is off it does not happen). This was seen first on Sles10sp2 but then I verified it happens on 2.6.28.2 too. Bellow is a dump of the crash info from 2.6.28.2: sd 2:0:0:1: Attached scsi generic sg3 type 0 BUG: unable to handle kernel NULL pointer dereference at 0004 IP: [803c50a4] skb_seq_read+0xfb/0x1a1 PGD 227115067 PUD 0 Oops: [#1] SMP last sysfs file: /sys/devices/platform/host2/session2/target2:0:0/2:0:0:1/type CPU 2 Modules linked in: ib_uverbs ib_umad mlx4_ib nfs lockd nfs_acl mlx4_core sunrpc ib_mthca ib_ipoib ib_cm ib_sa ib_mad ib_core inet_lro ipv6 button battery a] Pid: 0, comm: swapper Not tainted 2.6.28.2-debug #3 RIP: 0010:[803c50a4] [803c50a4] skb_seq_read+0xfb/0x1a1 RSP: 0018:88022f0e3b00 EFLAGS: 00010246 RAX: 88022dd44f38 RBX: 88022f0e3b30 RCX: RDX: RSI: 88022f0e3b88 RDI: 07d4 RBP: 07d4 R08: 880220476d30 R09: 085c R10: 000b0038 R11: a0126115 R12: 88022f0e3b88 R13: 88022d974d38 R14: 07d4 R15: 07d4 FS: () GS:88022f07bb50() knlGS: CS: 0010 DS: 0018 ES: 0018 CR0: 8005003b CR2: 0004 CR3: 0002271c2000 CR4: 06e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process swapper (pid: 0, threadinfo 88022f0da000, task 88022f0a4050) Stack: 88022d974fa0 88022f0e3b30 07d4 a01261fe 88022d974f80 880220418068 085c 07d4 880220476d30 88022dd44f38 88022dd44e58 Call Trace: IRQ 0 [a01261fe] ? iscsi_tcp_recv+0x64/0x39b [iscsi_tcp] [803f0d0f] ? ip_queue_xmit+0x2aa/0x2fd [803f60fd] ? tcp_read_sock+0x97/0x212 [a012619a] ? iscsi_tcp_recv+0x0/0x39b [iscsi_tcp] [a012615d] ? iscsi_tcp_data_ready+0x48/0x85 [iscsi_tcp] [803ff119] ? tcp_rcv_established+0x4c0/0x567 [804042f8] ? tcp_v4_do_rcv+0x2c/0x1c8 [80405fb9] ? tcp_v4_rcv+0x630/0x683 [803c6552] ? skb_release_head_state+0x60/0x8f [803ecb9f] ? ip_local_deliver_finish+0xda/0x197 [803ecaab] ? ip_rcv_finish+0x32f/0x349 [a024e42d] ? lro_flush+0x159/0x17e [inet_lro] [a024eb2e] ? __lro_proc_skb+0x1ca/0x1ed [inet_lro] [80221e28] ? 
swiotlb_map_single_phys+0x0/0x12 [a024eb69] ? lro_receive_skb+0x18/0x3e [inet_lro] [a0299582] ? ipoib_ib_handle_rx_wc+0x1ed/0x22b [ib_ipoib] [a0299e97] ? ipoib_poll+0x9c/0x173 [ib_ipoib] [803ce1d0] ? net_rx_action+0x9d/0x175 [80239ffb] ? __do_softirq+0x7a/0x13d [8020cf4c] ? call_softirq+0x1c/0x28 [8020df5d] ? do_softirq+0x2c/0x68 [8020e05b] ? do_IRQ+0xc2/0xdf [8020c206] ? ret_from_intr+0x0/0xa EOI 0 [80212464] ? mwait_idle+0x41/0x44 [8020abca] ? cpu_idle+0x40/0x5e Code: ff 88 48 e0 ff ff 48 c7 43 20 00 00 00 00 ff 43 08 8b 46 0c 01 43 0c 48 8b 43 18 8b 4b 08 8b 90 b4 00 00 00 48 03 90 b8 00 00 00 0f b7 42 04 39 c1 RIP [803c50a4] skb_seq_read+0xfb/0x1a1 RSP 88022f0e3b00 CR2: 0004 Kernel panic - not syncing: Fatal exception in interrupt When I looked at this on sles10 I was able to verify that the problem was with (see bellow where this comes from) st-cur_skb-next equals 0x: if (st-cur_skb-next) { st-cur_skb = st-cur_skb-next; === this where I see the problem st-frag_idx = 0; goto next_skb; } else if (st-root_skb == st-cur_skb ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [ewg] [PATCH] ib_core: save process's virtual address in struct ib_umem
On Mon, Jan 26, 2009 at 10:26:25AM -0800, Roland Dreier wrote: It is has to be saved either at the low level driver's mr object, e.g. struct mlx4_ib_mr, or at a common place like struct ib_umem. Do you prefer that it will be saved in struct mlx4_ib_mr? I don't see why it has to be saved anywhere? The only place you use umem-address is in handle_hugetlb_usermr(), and you could just as easily pass in start directly as a parameter (since mlx4_ib_reg_user_mr() has that value in a parameter anyway). Sorry for the delayed response. I see... you're right - no need to stick the address into struct ib_umem. Following this email is a new patch for mlx4_ib only. I excluded support for both powerpc and ia64 since I could not find a way to get HPAGE_SIZE (or HPAGE_SHIFT) for them. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [ewg] [PATCH] ib_core: save process's virtual address in struct ib_umem
On Thu, Jan 29, 2009 at 07:33:22AM -0800, Roland Dreier wrote: I see... you're right - no need to stick the address into struct ib_umem. Following this email is a new patch for mlx4_ib only. I excluded support for both powerpc and ia64 since I could not find a way to get HPAGE_SIZE (or HPAGE_SHIFT) for them. #include asm/page.h ? It does not help. The problem with powerpc is that HPAGE_SHIFT is an unexported variable and for ia64 it's hpage_shift. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH] ib_core: save process's virtual address in struct ib_umem
On Mon, Jan 26, 2009 at 09:59:26AM -0800, Roland Dreier wrote: seems like a really strange thing to do: + umem-address = addr; this value addr is coming from the low-level driver, so I'm not clear why we need to stick it into umem? What am I missing? It is has to be saved either at the low level driver's mr object, e.g. struct mlx4_ib_mr, or at a common place like struct ib_umem. Do you prefer that it will be saved in struct mlx4_ib_mr? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH v1] mlx4_ib: Optimize hugetlb pages support
On Thu, Jan 22, 2009 at 09:07:41PM -0800, Roland Dreier wrote: seems this might underestimate by 1 if the region doesn't start/end on a huge-page aligned boundary (but we would still want to use big pages to register it). Looks like we must pass the virtual address through struct ib_umem to the low level driver. I think we could avoid the uninitialized_var() stuff and having restart at all by just doing cur_size = 0 at the start of the loop, and then instead of if (restart) just test if cur_size is 0. initializing cur_size and eliminating restart works fine but cur_addr still needs this trick. I am sending two patches, one for ib_core and one for mlx4. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] ib_core: save process's virtual address in struct ib_umem
add address field to struct ib_umem so low level drivers will have this information which may be needed in order to correctly calculate the number of huge pages. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/umem.c |1 + include/rdma/ib_umem.h |1 + 2 files changed, 2 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 6f7c096..4c076c4 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -102,6 +102,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem-context = context; umem-length= size; umem-offset= addr ~PAGE_MASK; + umem-address = addr; umem-page_size = PAGE_SIZE; /* * We ask for writable memory if any access flags other than diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 9ee0d2e..c385bb6 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -43,6 +43,7 @@ struct ib_umem { struct ib_ucontext *context; size_t length; int offset; + unsigned long address; int page_size; int writable; int hugetlb; -- 1.6.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH v1] mlx4_ib: Optimize hugetlb pages support
Since Linux may not merge adjacent pages into a single scatter entry through calls to dma_map_sg(), we check the special case of hugetlb pages which are likely to be mapped to coniguous dma addresses and if they are, take advantage of this. This will result in a significantly lower number of MTT segments used for registering hugetlb memory regions. Signed-off-by: Eli Cohen e...@mellanox.co.il --- In this version I also took care of the case where the kernel is compiled without hugetlb support. drivers/infiniband/hw/mlx4/mr.c | 86 ++- 1 files changed, 75 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8e4d26d..4c7a5bf 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -119,6 +119,68 @@ out: return err; } +static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr, + u64 virt_addr, int access_flags) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct mlx4_ib_dev *dev = to_mdev(pd-device); + struct ib_umem_chunk *chunk; + unsigned dsize; + dma_addr_t daddr; + unsigned uninitialized_var(cur_size); + dma_addr_t uninitialized_var(cur_addr); + int restart; + int n; + struct ib_umem *umem = mr-umem; + u64 *arr; + int err = 0; + int i; + int j = 0; + +n = PAGE_ALIGN(umem-length + umem-offset) HPAGE_SHIFT; + arr = kmalloc(n * sizeof *arr, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + restart = 1; + list_for_each_entry(chunk, umem-chunk_list, list) + for (i = 0; i chunk-nmap; ++i) { + daddr = sg_dma_address(chunk-page_list[i]); + dsize = sg_dma_len(chunk-page_list[i]); + if (restart) { + cur_addr = daddr; + cur_size = dsize; + restart = 0; + } else if (cur_addr + cur_size != daddr) { + err = -EINVAL; + goto out; + } else +cur_size += dsize; + + if (cur_size HPAGE_SIZE) { + err = -EINVAL; + goto out; + } else if (cur_size == HPAGE_SIZE) { + restart = 1; + arr[j++] = cur_addr; + } + } + + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, umem-length, + 
convert_access(access_flags), n, HPAGE_SHIFT, mr-mmr); + if (err) + goto out; + +err = mlx4_write_mtt(dev-dev, mr-mmr.mtt, 0, n, arr); + +out: + kfree(arr); + return err; +#else + return -ENOSYS; +#endif +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -140,17 +202,19 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_free; } - n = ib_umem_page_count(mr-umem); - shift = ilog2(mr-umem-page_size); - - err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, - convert_access(access_flags), n, shift, mr-mmr); - if (err) - goto err_umem; - - err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); - if (err) - goto err_mr; + if (!mr-umem-hugetlb || handle_hugetlb_user_mr(pd, mr, virt_addr, access_flags)) { + n = ib_umem_page_count(mr-umem); + shift = ilog2(mr-umem-page_size); + + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, + convert_access(access_flags), n, shift, mr-mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); + if (err) + goto err_mr; + } err = mlx4_mr_enable(dev-dev, mr-mmr); if (err) -- 1.6.0.5 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] mlx4_ib: Optimize hugetlb pages support
Since Linux does not merge adjacent pages into a single scatter entry through calls to dma_map_sg(), we do this for huge pages which are guaranteed to be comprised of adjacent natural pages of size PAGE_SIZE. This will result in a significantly lower number of MTT segments used for registering hugetlb memory regions. Signed-off-by: Eli Cohen e...@mellanox.co.il --- I tried this patch and improves memory scalability. Without it, increasing the amount of memory used cause caused decrease in throughput. With this patch there was no drop at all - I used a test based on MPI with 2 processes. drivers/infiniband/hw/mlx4/mr.c | 67 ++ 1 files changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8e4d26d..641ea96 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -119,6 +119,38 @@ out: return err; } +int mlx4_ib_umem_write_huge_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem, int nhpages) +{ + struct ib_umem_chunk *chunk; + int j, i, k; + dma_addr_t *arr; + int err; + + arr = kmalloc(nhpages * sizeof *arr, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + i = 0; + k = 0; + list_for_each_entry(chunk, umem-chunk_list, list) + for (j = 0; j chunk-nmap; ++j, ++k) { + if (!(k ((1 (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) { + if (sg_dma_len(chunk-page_list[j]) != PAGE_SIZE) { + err = -EAGAIN; + goto out; + } + arr[i++] = sg_dma_address(chunk-page_list[j]); + } + } + + err = mlx4_write_mtt(dev-dev, mtt, 0, nhpages, arr); + +out: + kfree(arr); + return err; +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -128,6 +160,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int shift; int err; int n; + int nhuge; + int shift_huge; mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) @@ -142,15 +176,34 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, 
u64 start, u64 length, n = ib_umem_page_count(mr-umem); shift = ilog2(mr-umem-page_size); - - err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, + if (mr-umem-hugetlb) { + nhuge = ALIGN(n shift, HPAGE_SIZE) HPAGE_SHIFT; + shift_huge = HPAGE_SHIFT; + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, + convert_access(access_flags), nhuge, shift_huge, mr-mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_huge_mtt(dev, mr-mmr.mtt, mr-umem, nhuge); + if (err) { + if (err != -EAGAIN) + goto err_mr; + else { + mlx4_mr_free(to_mdev(pd-device)-dev, mr-mmr); + goto regular_pages; + } + } + } else { +regular_pages: + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, convert_access(access_flags), n, shift, mr-mmr); - if (err) - goto err_umem; + if (err) + goto err_umem; - err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); - if (err) - goto err_mr; + err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); + if (err) + goto err_mr; + } err = mlx4_mr_enable(dev-dev, mr-mmr); if (err) -- 1.6.0.5 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [PATCH] IPoIB: fix race when manipulating mcast SKBs queue
On Wed, Dec 17, 2008 at 11:12:39AM -0800, Roland Dreier wrote: I don't see why this would be required. When ipoib_mcast_free() runs, the mcast structure has been removed from all lists and I don't see how any other context could simultaneously be adding packets to pkt_queue. What is the race that you think this fixes? I missed the fact that mcasts are moved to remove list so I guess there is another problem out there. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] IPoIB: fix race when manipulating mcast SKBs queue
ipoib_mcast_free() dequeues SKBs pending on the pkt_queue but needs to do that with netif_tx_lock_bh() acquired. Signed-off-by: Eli Cohen e...@mellanox.co.il --- I saw the following bug appear in ofed 1.4 on RHAS5.2 but have not yet had the chance to verify that this patch fixes this particular problem but I think this patch is valid anyway: Dec 13 04:57:26 mtilab17 kernel: ib0: dev_queue_xmit failed to requeue packet Dec 13 04:57:36 mtilab17 kernel: ib0: timing out; 63 sends not completed Dec 13 04:57:36 mtilab17 kernel: Attempt to release alive inet socket 8102621d3680 Dec 13 04:57:36 mtilab17 kernel: Attempt to release alive inet socket 81026ba4b0c0 Dec 13 04:57:36 mtilab17 kernel: BUG: warning at include/net/dst.h:153/dst_release() (Tainted: G ) Dec 13 04:57:36 mtilab17 kernel: Dec 13 04:57:36 mtilab17 kernel: Call Trace: Dec 13 04:57:36 mtilab17 kernel: IRQ [800288ef] __kfree_skb+0x47/0x110 Dec 13 04:57:36 mtilab17 kernel: [885d69ce] :ib_ipoib:ipoib_cm_handle_tx_wc+0xc2/0x228 Dec 13 04:57:36 mtilab17 kernel: [885cfae2] :ib_ipoib:ipoib_poll+0xaa/0x19e Dec 13 04:57:36 mtilab17 kernel: [8000c54c] net_rx_action+0xa4/0x1a4 Dec 13 04:57:36 mtilab17 kernel: [8832da0a] :mlx4_core:poll_catas+0x0/0x137 --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index d9d1223..dd320d5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -94,12 +94,12 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast) if (mcast-ah) ipoib_put_ah(mcast-ah); + netif_tx_lock_bh(dev); while (!skb_queue_empty(mcast-pkt_queue)) { ++tx_dropped; dev_kfree_skb_any(skb_dequeue(mcast-pkt_queue)); } - netif_tx_lock_bh(dev); dev-stats.tx_dropped += tx_dropped; netif_tx_unlock_bh(dev); -- 1.6.0.5 ___ general mailing list general@lists.openfabrics.org 
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH] ipoib: null tx/rx_ring skb pointers on free
On Wed, Nov 05, 2008 at 05:23:07PM -0800, [EMAIL PROTECTED] wrote: Hi Arthur, looking a the patch I don't understand why it should fix the problem you're seeing. I suspect we may be hiding the problem. diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 7b14c2c..8f8650b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev, ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, rx_ring[i].mapping); dev_kfree_skb_any(rx_ring[i].skb); + rx_ring[i].skb = NULL; } vfree(rx_ring); This is not needed since the ring is being freed. @@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ if (unlikely(ib_dma_mapping_error(priv-ca, addr))) { ++dev-stats.tx_errors; dev_kfree_skb_any(skb); + tx_req-skb = NULL; return; } Here we will never get completion so why do we need this? @@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ++dev-stats.tx_errors; ib_dma_unmap_single(priv-ca, addr, skb-len, DMA_TO_DEVICE); dev_kfree_skb_any(skb); + tx_req-skb = NULL; Also here, we don't get a completion. } else { dev-trans_start = jiffies; ++tx-tx_head; @@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) dev-stats.tx_bytes += tx_req-skb-len; dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; And here we already got the completion so we shouldn't exptect another free of the SKB. 
netif_tx_lock(dev); @@ -1179,6 +1183,7 @@ timeout: ib_dma_unmap_single(priv-ca, tx_req-mapping, tx_req-skb-len, DMA_TO_DEVICE); dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; and here we're freeing the ring ++p-tx_tail; netif_tx_lock_bh(p-dev); if (unlikely(--priv-tx_outstanding == ipoib_sendq_size 1) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 28eb6f0..f7e3497 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) dev-stats.tx_bytes += tx_req-skb-len; dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; ++priv-tx_tail; if (unlikely(--priv-tx_outstanding == ipoib_sendq_size 1) @@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, if (unlikely(ipoib_dma_map_tx(priv-ca, tx_req))) { ++dev-stats.tx_errors; dev_kfree_skb_any(skb); + tx_req-skb = NULL; return; } @@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, --priv-tx_outstanding; ipoib_dma_unmap_tx(priv-ca, tx_req); dev_kfree_skb_any(skb); + tx_req-skb = NULL; if (netif_queue_stopped(dev)) netif_wake_queue(dev); } else { @@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv-ca, tx_req); dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; ++priv-tx_tail; --priv-tx_outstanding; } ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general