[ofa-general] [PATCH] mlx4: remove limitation on LSO header size
Current code has a limitation as for the size of an LSO header not allowed to cross a 64 byte boundary. This patch removes this limitation by setting the WQE RR for large headers thus allowing LSO headers of any size. The extra buffer reserved for MLX4_IB_QP_LSO QPs has been doubled, from 64 to 128 bytes, assuming this is reasonable upper limit to header length. Also, this patch will cause IB_DEVICE_UD_TSO to be set only of FW versions that set MLX4_DEV_CAP_FLAG_BLH; e.g. FW version 2.6.000 and higher. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c |2 +- drivers/infiniband/hw/mlx4/qp.c | 17 +++-- include/linux/mlx4/device.h |1 + 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3cb3f47..e596537 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -103,7 +103,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props-device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev-dev-caps.flags MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props-device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - if (dev-dev-caps.max_gso_sz) + if (dev-dev-caps.max_gso_sz dev-dev-caps.flags MLX4_DEV_CAP_FLAG_BLH) props-device_cap_flags |= IB_DEVICE_UD_TSO; if (dev-dev-caps.bmme_flags MLX4_BMME_FLAG_RESERVED_LKEY) props-device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 219b103..1b356cf 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -261,7 +261,7 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) case IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags MLX4_IB_QP_LSO) ? 64 : 0); + ((flags MLX4_IB_QP_LSO) ? 
128 : 0); case IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); @@ -1467,16 +1467,11 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, -__be32 *lso_hdr_sz) +__be32 *lso_hdr_sz, int *blh) { unsigned halign = ALIGN(sizeof *wqe + wr-wr.ud.hlen, 16); - /* -* This is a temporary limitation and will be removed in -* a forthcoming FW release: -*/ - if (unlikely(halign 64)) - return -EINVAL; + *blh = unlikely(halign 64) ? 1 : 0; if (unlikely(!(qp-flags MLX4_IB_QP_LSO) wr-num_sge qp-sq.max_gs - (halign 4))) @@ -1523,6 +1518,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); int i; + int blh = 0; spin_lock_irqsave(qp-sq.lock, flags); @@ -1616,7 +1612,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr-opcode == IB_WR_LSO) { - err = build_lso_seg(wqe, wr, qp, seglen, lso_hdr_sz); + err = build_lso_seg(wqe, wr, qp, seglen, lso_hdr_sz, blh); if (unlikely(err)) { *bad_wr = wr; goto out; @@ -1687,7 +1683,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl-owner_opcode = mlx4_ib_opcode[wr-opcode] | - (ind qp-sq.wqe_cnt ? cpu_to_be32(1 31) : 0); + (ind qp-sq.wqe_cnt ? cpu_to_be32(1 31) : 0) | + (blh ? 
cpu_to_be32(1 6) : 0); stamp = ind + qp-sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U qp-sq.wqe_shift); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ce7cc6c..e92d1bf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -61,6 +61,7 @@ enum { MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 8, MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 9, MLX4_DEV_CAP_FLAG_DPDP = 1 12, + MLX4_DEV_CAP_FLAG_BLH = 1 15, MLX4_DEV_CAP_FLAG_MEM_WINDOW= 1 16, MLX4_DEV_CAP_FLAG_APM = 1 17, MLX4_DEV_CAP_FLAG_ATOMIC= 1 18, -- 1.6.4.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
[ofa-general] [PATCH v2] mlx4: configure cache line size
ConnectX can work more efficiently if the CPU cache line size is configured to it at INIT_HCA. This patch configures the CPU cache line size. Signed-off-by: Eli Cohen e...@mellanox.co.il --- As per Roland's comments, the following changes were made: 1. Remove #ifdef cache_line_size and include linux/cache.h 2. Assume cache line size is a power of 2 and use ilog2 instead of order_base_2 drivers/net/mlx4/fw.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index cee199c..3c16602 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -33,6 +33,7 @@ */ #include linux/mlx4/cmd.h +#include linux/cache.h #include fw.h #include icm.h @@ -698,6 +699,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_IN_SIZE0x200 #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION2 +#define INIT_HCA_CACHELINE_SZ_OFFSET0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET(INIT_HCA_QPC_OFFSET + 0x10) @@ -735,6 +737,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) *((u8 *) mailbox-buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; + *((u8 *) mailbox-buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + (ilog2(cache_line_size()) - 4) 5; + #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) = ~cpu_to_be32(1 1); #elif defined(__BIG_ENDIAN) -- 1.6.4.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Possible process deadlock in RMPP flow
Hi Sean, one of our customers experiences problems when running ibnetdiscover. The problem happens from time to time. Here is the call stack the he gets: ibnetdiscover D 80149b8d 0 26968 26544 (L-TLB) 8102c900bd88 0046 81037e8e 81037e8e02e8 8102c900bd78 000a 8102c5b50820 81038a929820 011837bf6105 0ede 8102c5b50a08 0001 Call Trace: [80064207] wait_for_completion+0x79/0xa2 [8008b4cc] default_wake_function+0x0/0xe [882271d9] :ib_mad:ib_cancel_rmpp_recvs+0x87/0xde [88224485] :ib_mad:ib_unregister_mad_agent+0x30d/0x424 [883983e9] :ib_umad:ib_umad_close+0x9d/0xd6 [80012e22] __fput+0xae/0x198 [80023de6] filp_close+0x5c/0x64 [800393df] put_files_struct+0x63/0xae [80015b26] do_exit+0x31c/0x911 [8004971a] cpuset_exit+0x0/0x6c [8005e116] system_call+0x7e/0x83 From the dump it seems that the process is waits on the call to flush_workqueue() in ib_cancel_rmpp_recvs(). The package they use is OFED 1.4.2. Do you have any idea or suggestions how to sort this out? Thanks. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: Possible process deadlock in RMPP flow
On Wed, Sep 23, 2009 at 09:08:28AM -0700, Sean Hefty wrote: Roland just submitted a patch in this area yesterday. I don't know if the patch would fix their issue, but it may be worth trying. What kernel does 1.4.2 map to? I think OFED 1.4.2 is based on kernel 2.6.27 but they're using RHEL 5.3. Thanks, we'll try this. What RMPP messages does ibnetdiscover use? If the program is completing successfully, there may be a different race with the rmpp cleanup. I'll see if anything else stands out in that area. - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH] mlx4: configure cache line size
On Tue, Sep 22, 2009 at 10:51:10AM -0700, Roland Dreier wrote: I agree with you on both comments. Would you like me to resend or will you make the necessary changes? +#if defined(cache_line_size) Why the #if here? Do we just need to include linux/cache.h explicitly to make sure we get the define? + *((u8 *) mailbox-buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + order_base_2(cache_line_size() / 16) 5; Trivial but I think it's safe to assume a cacheline is always a power of 2. And I think it's clearer (and avoids generating a divide) to use subtraction rather than division... so this could all become: (ilog2(cache_line_size()) - 4) 5; - R. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] mlx4: configure cache line size
ConnectX can work more efficiently if the CPU cache line size is confiugred to it at INIT_HCA. This patch configures cache line size for systems that report it. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/net/mlx4/fw.c |7 +++ 1 files changed, 7 insertions(+), 0 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 20526ce..aa38c06 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -699,6 +699,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_IN_SIZE0x200 #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION2 +#define INIT_HCA_CACHELINE_SZ_OFFSET0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET(INIT_HCA_QPC_OFFSET + 0x10) @@ -736,6 +737,12 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) *((u8 *) mailbox-buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; +#if defined(cache_line_size) + *((u8 *) mailbox-buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + order_base_2(cache_line_size() / 16) 5; +#endif + + #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) = ~cpu_to_be32(1 1); #elif defined(__BIG_ENDIAN) -- 1.6.4.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Better way to get sufficient EQ context memory?
On Thu, Aug 20, 2009 at 02:33:41PM -0700, Roland Dreier wrote: Eli, it occurs to me that since we're doing more than one page for EQ context now, we might as well use the normal ICM table stuff that everything else uses. Seems the code becomes much simpler and I don't think there's any real overhead added... thoughts? Yes it look cleaner, and it works well on my 4 core system. Let's wait for Christoph's approval on his system. (Christoph, I tested this with possible_cpus=32 and it still works for me -- if you get a chance on your Dell systems that would be helpful too) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv5 0/10] RDMAoE support
Roland, what about this series of patches? Would you like me to re-create them over your xrc branch or would you rather take them before xrc? On Wed, Aug 19, 2009 at 08:19:35PM +0300, Eli Cohen wrote: RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. RDMAoE encodes IP addresses into its GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs, standard IP to MAC mappings apply. To support RDMAoE, a new transport protocol was added to the IB core. An RDMA device can have ports with different transports, which are identified by a port transport attribute. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs while LID fields are ignored. The Ethernet L2 information is subsequently obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. As there is no SA in RDMAoE, the CMA code is modified to fill the necessary path record attributes locally before sending CM packets. Similarly, the CMA provides to the user the required address handle attributes when processing SIDR requests and joining multicast groups. In this patch set, an RDMAoE port is currently assigned a single GID, encoding the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code temporarily uses IPv6 link-local addresses as GIDs instead of the IP address provided by the user, thereby supporting any IP address. 
To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. Following is a series of 10 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches, and is tagged v5. Changes from v4: 1. Added rdma_is_transport_supported() and used it to simplify conditionals throughout the code. 2. ib_register_mad_agent()for QP0 is only called for IB ports 3. PATCH 5/10 changed from Enable support for RDMAoE ports to Enable support only for IB ports. 4. MAD services from userspace currently not supported for RDMAoE ports. 5. Add kref to struct cma_multicast to aid in maintaining reference count on the object. This is to avoid freeing the object while the worker thread is still using it. 6. Return immediate error for invalid MTU when resolving an RDMAoE path 7. Don't fail resolve path if rate is 0 since this value stands for IB_RATE_PORT_CURRENT. 8. In cma_rdmaoe_join_multicast(), fail immediately if mtu is zero. 9. Add ucma_copy_rdmaoe_route()instead of modifying ucma_copy_ib_route(). 10. Bug fix: in PATCH 10/10, call flush_workqueue after unregistering netdev notifiers 11. Multicast no longer use the broadcast MAC. 12. No changes to patches 2, 7 and 8 from the v4 series. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- b/drivers/infiniband/core/agent.c | 38 ++- b/drivers/infiniband/core/cm.c | 25 +- b/drivers/infiniband/core/cma.c | 54 ++-- b/drivers/infiniband/core/mad.c | 41 ++- b/drivers/infiniband/core/multicast.c |4 b/drivers/infiniband/core/sa_query.c| 39 ++- b/drivers/infiniband/core/ucm.c |8 b/drivers/infiniband/core/ucma.c|2 b/drivers/infiniband/core/ud_header.c | 111 ++ b/drivers/infiniband/core/user_mad.c|6 b/drivers/infiniband/core/uverbs.h |1 b/drivers/infiniband/core/uverbs_cmd.c | 32 ++ b/drivers/infiniband/core/uverbs_main.c |1 b/drivers/infiniband/core/verbs.c | 25 ++ b/drivers/infiniband/hw/mlx4/ah.c | 187 +--- b/drivers/infiniband/hw/mlx4/mad.c | 32 +- b/drivers/infiniband/hw/mlx4/main.c | 309 +--- b/drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 + b/drivers/infiniband/hw/mlx4/qp.c | 172 ++- b/drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 - b/drivers/net/mlx4/en_main.c| 15 + b/drivers/net/mlx4/en_port.c|4 b/drivers/net/mlx4/en_port.h|3 b/drivers/net/mlx4/fw.c |3 b/drivers/net/mlx4/intf.c
[ofa-general] [PATCHv5 01/10] ib_core: Refine device personality from node type to port type
As a preparation to devices that, in general, support a different transport protocol for each port, specifically RDMAoE, this patch defines a transport type for each of a device's ports. As a result, rdma_node_get_transport() has been unexported and is used internally by the implementation of the new API, rdma_port_get_transport(), which gives the transport protocol of the queried port. rdma_is_transport_supported() is also added to be used for verifying if a given device supports a given protocol on any of its ports. All references to rdma_node_get_transport() are changed to to use the new APIs. Also, ib_port_attr is extended to contain enum rdma_transport_type. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: Define and make use of rdma_is_transport_supported(), an API that allows the caller to check if a given device supports a given transport protocol on any of its ports. drivers/infiniband/core/cm.c | 25 + drivers/infiniband/core/cma.c | 54 +++-- drivers/infiniband/core/mad.c | 41 ++ drivers/infiniband/core/multicast.c |4 +- drivers/infiniband/core/sa_query.c| 39 + drivers/infiniband/core/ucm.c |8 +++- drivers/infiniband/core/ucma.c|2 +- drivers/infiniband/core/user_mad.c|6 +++- drivers/infiniband/core/verbs.c | 25 - drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 +++--- include/rdma/ib_verbs.h | 11 -- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |3 +- net/sunrpc/xprtrdma/svc_rdma_transport.c |2 +- 13 files changed, 148 insertions(+), 84 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 5130fc5..d082f59 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3678,8 +3678,9 @@ static void cm_add_one(struct ib_device *ib_device) unsigned long flags; int ret; u8 i; + enum rdma_transport_type tt; - if (rdma_node_get_transport(ib_device-node_type) != RDMA_TRANSPORT_IB) + if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB)) return; cm_dev = kzalloc(sizeof(*cm_dev) 
+ sizeof(*port) * @@ -3700,6 +3701,10 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { + tt = rdma_port_get_transport(ib_device, i); + if (tt != RDMA_TRANSPORT_IB) + continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto error1; @@ -3742,9 +3747,11 @@ error1: port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); @@ -3770,10 +3777,12 @@ static void cm_remove_one(struct ib_device *ib_device) for (i = 1; i = ib_device-phys_port_cnt; i++) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - flush_workqueue(cm.wq); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + flush_workqueue(cm.wq); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..02fd045 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -329,24 +329,26 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev; union ib_gid gid; int ret = -ENODEV; - - switch (rdma_node_get_transport(dev_addr-dev_type)) { - case RDMA_TRANSPORT_IB: - ib_addr_get_sgid(dev_addr, gid); - break; - case RDMA_TRANSPORT_IWARP: - iw_addr_get_sgid(dev_addr, gid); - break; - default: - return -ENODEV; - } + int port; list_for_each_entry(cma_dev, dev_list, list) { - ret
[ofa-general] [PATCHv5 03/10] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: 1. Instead of returning NULL for unsupported ports (which is no considered an error), now callers of ib_register_mad_agent() must verify that the port/special QP is supported before calling this function. 2. Make use of rdma_is_transport_supported() where appropriate. drivers/infiniband/core/agent.c | 38 +- drivers/infiniband/core/mad.c | 37 + 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..c130a4a 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device*device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -146,6 +147,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) struct ib_agent_port_private *port_priv; unsigned long flags; int ret; + enum rdma_transport_type tt; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); @@ -155,14 +157,17 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error1; } - /* Obtain send only MAD agent for SMI QP */ - port_priv-agent[0] = ib_register_mad_agent(device, port_num, - IB_QPT_SMI, NULL, 0, - agent_send_handler, - NULL, 
NULL); - if (IS_ERR(port_priv-agent[0])) { - ret = PTR_ERR(port_priv-agent[0]); - goto error2; + tt = rdma_port_get_transport(device, port_num); + if (tt == RDMA_TRANSPORT_IB) { + /* Obtain send only MAD agent for SMI QP */ + port_priv-agent[0] = ib_register_mad_agent(device, port_num, + IB_QPT_SMI, NULL, 0, + agent_send_handler, + NULL, NULL); + if (IS_ERR(port_priv-agent[0])) { + ret = PTR_ERR(port_priv-agent[0]); + goto error2; + } } /* Obtain send only MAD agent for GSI QP */ @@ -175,6 +180,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); @@ -182,7 +190,8 @@ int ib_agent_port_open(struct ib_device *device, int port_num) return 0; error3: - ib_unregister_mad_agent(port_priv-agent[0]); + if (tt == RDMA_TRANSPORT_IB) + ib_unregister_mad_agent(port_priv-agent[0]); error2: kfree(port_priv); error1: @@ -194,6 +203,9 @@ int ib_agent_port_close(struct ib_device *device, int port_num) struct ib_agent_port_private *port_priv; unsigned long flags; + if (rdma_port_get_transport(device, port_num) != RDMA_TRANSPORT_IB) + return 0; + spin_lock_irqsave(ib_agent_port_list_lock, flags); port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c06117c..aceae79 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2602,6 +2602,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2646,9 @@ static int
[ofa-general] [PATCHv5 04/10] IB/umad: Enable support only for IB ports
Initialize umad context for devices that have any of their ports IB. Since devices may have ports of two different protocols (for example, RDMA_TRANSPORT_IB and RDMA_TRANSPORT_RDMAOE), ib_umad_add_one() needs to succeed if any of the ports is IB but ib_umad_init_port() is called only for IB ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from last version: 1. Patch title changed from to Enable support for RDMAoE ports to Enable support only for IB ports. 2. Do not allow userspace MADs to RDMAoE ports. drivers/infiniband/core/user_mad.c | 15 +++ 1 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index aa4eeb3..51888eb 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1123,10 +1123,6 @@ static void ib_umad_add_one(struct ib_device *device) e = device-phys_port_cnt; } - for (i = s; i = e; ++i) - if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB) - return; - umad_dev = kzalloc(sizeof *umad_dev + (e - s + 1) * sizeof (struct ib_umad_port), GFP_KERNEL); @@ -1141,8 +1137,9 @@ static void ib_umad_add_one(struct ib_device *device) for (i = s; i = e; ++i) { umad_dev-port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, umad_dev-port[i - s])) - goto err; + if (rdma_port_get_transport(device, i) == RDMA_TRANSPORT_IB) + if (ib_umad_init_port(device, i, umad_dev-port[i - s])) + goto err; } ib_set_client_data(device, umad_client, umad_dev); @@ -1151,7 +1148,8 @@ static void ib_umad_add_one(struct ib_device *device) err: while (--i = s) - ib_umad_kill_port(umad_dev-port[i - s]); + if (rdma_port_get_transport(device, i) == RDMA_TRANSPORT_IB) + ib_umad_kill_port(umad_dev-port[i - s]); kref_put(umad_dev-ref, ib_umad_release_dev); } @@ -1165,7 +1163,8 @@ static void ib_umad_remove_one(struct ib_device *device) return; for (i = 0; i = umad_dev-end_port - umad_dev-start_port; ++i) - ib_umad_kill_port(umad_dev-port[i]); + if 
(rdma_port_get_transport(device, i + 1) == RDMA_TRANSPORT_IB) + ib_umad_kill_port(umad_dev-port[i]); kref_put(umad_dev-ref, ib_umad_release_dev); } -- 1.6.4 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv5 05/10] ib/cm: Enable CM support for RDMAoE
CM messages can be transported on RDMAoE protocol ports so they are enabled here. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from last version: Make use of rdma_is_transport_supported() drivers/infiniband/core/cm.c |5 +++-- drivers/infiniband/core/ucm.c | 12 +--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d082f59..c9f9122 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3680,7 +3680,8 @@ static void cm_add_one(struct ib_device *ib_device) u8 i; enum rdma_transport_type tt; - if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB)) + if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB) + !rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_RDMAOE)) return; cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * @@ -3702,7 +3703,7 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { tt = rdma_port_get_transport(ib_device, i); - if (tt != RDMA_TRANSPORT_IB) + if (tt != RDMA_TRANSPORT_IB tt != RDMA_TRANSPORT_RDMAOE) continue; port = kzalloc(sizeof *port, GFP_KERNEL); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index b508020..3ce5df2 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1240,13 +1240,19 @@ static void ib_ucm_add_one(struct ib_device *device) { struct ib_ucm_device *ucm_dev; int i; + enum rdma_transport_type tt; if (!device-alloc_ucontext) return; - for (i = 1; i = device-phys_port_cnt; ++i) - if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB) - return; + for (i = 1; i = device-phys_port_cnt; ++i) { + tt = rdma_port_get_transport(device, i); + if (tt == RDMA_TRANSPORT_IB || tt == RDMA_TRANSPORT_RDMAOE) + break; + } + + if (i device-phys_port_cnt) + return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); if (!ucm_dev) -- 1.6.4 ___ general 
mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv5 06/10] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Path resolving and multicast joining are implemented within cma.c by filling the responses and pushing the callbacks to the cma work queue. IP-GID resolution always yields IPv6 link local addresses - remote GIDs are derived from the destination MAC address of the remote port. Multicast GIDs are always mapped to multicast MACs as is done in IPv6; addtion/removal of addresses are made by calling dev_mc_add/delete thus causing the netedvice driver to update the corresponding port's configuration. IPv4 multlicast is not supported currently. Some helper functions are added to ib_addr.h. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from last version: 1. Add kref to struct cma_multicast to aid in maintaining reference count on the object. This is to avoid freeing the object while the worker thread is still using it. 2. return an immediate error if we get an invalid mtu in a resolved path 3. Don't fail resolve path if rate is 0 since this value stands for IB_RATE_PORT_CURRENT. 4. In cma_rdmaoe_join_multicast(), fail immediately if mtu is zero. 5. Add ucma_copy_rdmaoe_route() to copy route to userspace instead of modifying ucma_copy_ib_route(). 
drivers/infiniband/core/cma.c | 207 ++- drivers/infiniband/core/ucma.c | 31 ++ include/rdma/ib_addr.h | 92 ++ 3 files changed, 324 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 02fd045..6e56e27 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -58,6 +58,7 @@ MODULE_LICENSE(Dual BSD/GPL); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define RDMAOE_PACKET_LIFETIME 18 static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -157,6 +158,7 @@ struct cma_multicast { struct list_headlist; void*context; struct sockaddr_storage addr; + struct kref mcref; }; struct cma_work { @@ -173,6 +175,12 @@ struct cma_ndev_work { struct rdma_cm_eventevent; }; +struct rdmaoe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast*mc; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -290,6 +298,20 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(cma_dev-comp); } +static inline void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + struct rdma_dev_addr *dev_addr = mc-id_priv-id.route.addr.dev_addr; + u8 mac[6]; + + rdma_get_mcast_mac((struct in6_addr *)(mc-multicast.ib-rec.mgid), mac); + rtnl_lock(); + dev_mc_delete(dev_addr-src_dev, mac, 6, 0); + rtnl_unlock(); + kfree(mc-multicast.ib); + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(id_priv-list); @@ -340,6 +362,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); break; + case RDMA_TRANSPORT_RDMAOE: + rdmaoe_addr_get_sgid(dev_addr, gid); + break; default: return -ENODEV; } @@ -568,10 +593,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct 
rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; int ret; + u16 pkey; + + if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) == + RDMA_TRANSPORT_IB) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0x; ret = ib_find_cached_pkey(id_priv-id.device, id_priv-id.port_num, - ib_addr_get_pkey(dev_addr), - qp_attr-pkey_index); + pkey, qp_attr-pkey_index); if (ret) return ret; @@ -601,6 +632,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: if (!id_priv-cm_id.ib || cma_is_ud_ps(id_priv-id.ps)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else @@ -828,8 +860,17 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc
[ofa-general] [PATCHv5 09/10] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or a multicast GID and otherwise -EINVAL is returned. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: Call ib_register_mad_agent() for RDMA_TRANSPORT_IB type ports. drivers/infiniband/hw/mlx4/ah.c | 187 -- drivers/infiniband/hw/mlx4/mad.c | 32 -- drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 +++- drivers/infiniband/hw/mlx4/qp.c | 172 +-- drivers/net/mlx4/fw.c|3 +- include/linux/mlx4/device.h | 31 ++- include/linux/mlx4/qp.h |8 +- 7 files changed, 347 insertions(+), 105 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..0a015c3 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,166 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 s6 = {0}; + struct net_device *netdev; + int ifidx; - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); - memset(ah-av, 0, sizeof ah-av); + memcpy(s6.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + s6.sin6_family = AF_INET6; + s6.sin6_scope_id = ifidx; + if 
(rdma_link_local_addr(s6.sin6_addr)) + rdma_get_ll_mac(s6.sin6_addr, mac); + else if (rdma_is_multicast_addr(s6.sin6_addr)) { + rdma_get_mcast_mac(s6.sin6_addr, mac); + *is_mcast = 1; + } else + return -EINVAL; - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - --ah-av.stat_rate; - } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + return 0; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); + } + + ah-av.ib.dlid= cpu_to_be16(ah_attr-dlid); + if (ah_attr-static_rate) { + ah-av.ib.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; + while (ah-av.ib.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET + !(1 ah-av.ib.stat_rate dev-caps.stat_rate_support)) + --ah-av.ib.stat_rate; } + ah-av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); return ah-ibah; } +static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd-device); + struct 
mlx4_dev *dev = ibdev
[ofa-general] [PATCHv5 0/10] RDMAoE support
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. RDMAoE encodes IP addresses into its GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs, standard IP to MAC mappings apply. To support RDMAoE, a new transport protocol was added to the IB core. An RDMA device can have ports with different transports, which are identified by a port transport attribute. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs while LID fields are ignored. The Ethernet L2 information is subsequently obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. As there is no SA in RDMAoE, the CMA code is modified to fill the necessary path record attributes locally before sending CM packets. Similarly, the CMA provides to the user the required address handle attributes when processing SIDR requests and joining multicast groups. In this patch set, an RDMAoE port is currently assigned a single GID, encoding the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code temporarily uses IPv6 link-local addresses as GIDs instead of the IP address provided by the user, thereby supporting any IP address. To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. 
We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. Following is a series of 10 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches, and is tagged v5. Changes from v4: 1. Added rdma_is_transport_supported() and used it to simplify conditionals throughout the code. 2. ib_register_mad_agent() for QP0 is only called for IB ports 3. PATCH 5/10 changed from Enable support for RDMAoE ports to Enable support only for IB ports. 4. MAD services from userspace currently not supported for RDMAoE ports. 5. Add kref to struct cma_multicast to aid in maintaining reference count on the object. This is to avoid freeing the object while the worker thread is still using it. 6. Return immediate error for invalid MTU when resolving an RDMAoE path 7. Don't fail resolve path if rate is 0 since this value stands for IB_RATE_PORT_CURRENT. 8. In cma_rdmaoe_join_multicast(), fail immediately if mtu is zero. 9. Add ucma_copy_rdmaoe_route() instead of modifying ucma_copy_ib_route(). 10. Bug fix: in PATCH 10/10, call flush_workqueue after unregistering netdev notifiers 11. Multicast no longer uses the broadcast MAC. 12. No changes to patches 2, 7 and 8 from the v4 series. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- b/drivers/infiniband/core/agent.c | 38 ++- b/drivers/infiniband/core/cm.c | 25 +- b/drivers/infiniband/core/cma.c | 54 ++-- b/drivers/infiniband/core/mad.c | 41 ++- b/drivers/infiniband/core/multicast.c |4 b/drivers/infiniband/core/sa_query.c| 39 ++- b/drivers/infiniband/core/ucm.c |8 b/drivers/infiniband/core/ucma.c|2 b/drivers/infiniband/core/ud_header.c | 111 ++ b/drivers/infiniband/core/user_mad.c|6 b/drivers/infiniband/core/uverbs.h |1 b/drivers/infiniband/core/uverbs_cmd.c | 32 ++ b/drivers/infiniband/core/uverbs_main.c |1 b/drivers/infiniband/core/verbs.c | 25 ++ b/drivers/infiniband/hw/mlx4/ah.c | 187 +--- b/drivers/infiniband/hw/mlx4/mad.c | 32 +- b/drivers/infiniband/hw/mlx4/main.c | 309 +--- b/drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 + b/drivers/infiniband/hw/mlx4/qp.c | 172 ++- b/drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 - b/drivers/net/mlx4/en_main.c| 15 + b/drivers/net/mlx4/en_port.c|4 b/drivers/net/mlx4/en_port.h|3 b/drivers/net/mlx4/fw.c |3 b/drivers/net/mlx4/intf.c | 20 + b/drivers/net/mlx4/main.c |6 b/drivers/net/mlx4/mlx4.h |1 b/include/linux/mlx4/cmd.h |1 b/include/linux/mlx4/device.h | 31 ++ b/include/linux/mlx4/driver.h | 16 + b/include/linux
Re: [ofa-general] mlx4: device driver tries to sync DMA memory it has not allocated
On Mon, Aug 10, 2009 at 01:30:44PM -0700, Roland Dreier wrote: Looking at mlx4_write_mtt_chunk() I see that it calls mlx4_table_find() with a pointer to single dma_addr_t - dma_handle - while the dma addresses for the ICM memory is actually a list of different addresses covering possibly different sizes. I think mlx4_table_find() should be changed to support that, and then we can use calls to dma_sync_single_for_cpu()/dma_sync_single_for_device() with the correct dma addresses. No, I think we're careful that we write MTT ranges that don't cross a page so there shouldn't be any problem. But a contiguous ICM memory does not map, in general, to a contiguous DMA memory, so if dma_sync_single_for_*() does not harm anything then it does not do anything useful either. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] mlx4: device driver tries to sync DMA memory it has not allocated
On Tue, Aug 11, 2009 at 11:23:48PM -0700, Roland Dreier wrote: Maybe I'm missing your point, but mlx4_table_find() does go to some trouble to find the right DMA address for the object being looked up. Of course it could be buggy but I still don't see why we would need a list of DMA addresses when we know we are only going to sync part of one page? Is this always true? What if you allocated an ICM buffer that uses non-adjacent pages? In this case you would need more than one call to dma_sync_single_for_*(), isn't it? I think the right thing to do to fix this is to switch to using map_single instead of map_sg, and then use dma_sync_single_range_for_xxx to sync the subset we care about. - R. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 06/10] ib_core: CMA device binding
On Mon, Aug 10, 2009 at 12:31:19PM -0700, Sean Hefty wrote: @@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; int ret; +u16 pkey; + +if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) == nit: It looks like the if is indented by spaces, instead of a tab. Will fix, thanks. +static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv) +{ +work-old_state = CMA_ROUTE_QUERY; +work-new_state = CMA_ROUTE_RESOLVED; +if (!route-path_rec-mtu || !route-path_rec-rate) { +work-event.event = RDMA_CM_EVENT_ROUTE_ERROR; +work-event.status = -1; Any reason not to fail immediately here and leave the id state unchanged? No real reason. Immediate failure is just as good as a deffered one. I will change that and also remove change the rule to ommit !route-path_rec-rate. +} else { +work-event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +work-event.status = 0; +} + +queue_work(cma_wq, work-work); + +kfree(mw); +} + +static int cma_rdmaoe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ +struct rdmaoe_mcast_work *work; +struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; + +if (cma_zero_addr((struct sockaddr *)mc-addr)) +return -EINVAL; + +work = kzalloc(sizeof *work, GFP_KERNEL); +if (!work) +return -ENOMEM; + +mc-multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); +if (!mc-multicast.ib) { +kfree(work); +return -ENOMEM; +} nit: I'd prefer to goto a common cleanup area to make it easier to add changes in the future. Will change that. 
+ +cma_set_mgid(id_priv, (struct sockaddr *)mc-addr, mc-multicast.ib- rec.mgid); +mc-multicast.ib-rec.pkey = cpu_to_be16(0x); +if (id_priv-id.ps == RDMA_PS_UDP) +mc-multicast.ib-rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); +mc-multicast.ib-rec.rate = rdmaoe_get_rate(dev_addr-src_dev); +mc-multicast.ib-rec.hop_limit = 1; +mc-multicast.ib-rec.mtu = rdmaoe_get_mtu(dev_addr-src_dev-mtu); Do we need to check the rate/mtu here, like in resolve route? Or should we be good since we could successfully resolve the route? Actually, can we just read the data from the path record that gets stored with the id? I believe that querying the mtu again to get an up to date vlaue will be better, plus adding a check for the mtu to be none zero or returning immediately with -EINVAL +rdmaoe_addr_get_sgid(dev_addr, mc-multicast.ib-rec.port_gid); +work-id = id_priv; +work-mc = mc; +INIT_WORK(work-work, rdmaoe_mcast_work_handler); + +queue_work(cma_wq, work-work); + +return 0; +} + int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { @@ -2782,6 +2918,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, case RDMA_TRANSPORT_IB: ret = cma_join_ib_multicast(id_priv, mc); break; +case RDMA_TRANSPORT_RDMAOE: +ret = cma_rdmaoe_join_multicast(id_priv, mc); +break; default: ret = -ENOSYS; break; @@ -2793,6 +2932,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, spin_unlock_irq(id_priv-lock); kfree(mc); } + return ret; } EXPORT_SYMBOL(rdma_join_multicast); port_num)) { +tt = rdma_port_get_transport(ctx-cm_id-device, ctx-cm_id-port_num); +switch (tt) { case RDMA_TRANSPORT_IB: -ucma_copy_ib_route(resp, ctx-cm_id-route); +case RDMA_TRANSPORT_RDMAOE: +ucma_copy_ib_route(resp, ctx-cm_id-route, tt); It seems simpler to just add a new call ucma_copy_rdmaoe_route, rather than merging those two transports into a single copy function that then branches based on the transport. Agree, will change. 
break; default: break; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 483057b..66a848e 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -39,6 +39,8 @@ #include linux/netdevice.h #include linux/socket.h #include rdma/ib_verbs.h +#include linux/ethtool.h +#include rdma/ib_pack.h struct rdma_addr_client { atomic_t refcount; @@ -157,4 +159,89 @@ static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr, memcpy(gid, dev_addr-dst_dev_addr, sizeof *gid); } + +static inline int rdma_link_local_addr(struct in6_addr *addr) +{ +if (addr-s6_addr32[0] == cpu_to_be32(0xfe80) +addr-s6_addr32[1] == 0) +return 1; +else +return 0; +} just replace the 'if'
Re: [ofa-general] mlx4: device driver tries to sync DMA memory it has not allocated
Looking at mlx4_write_mtt_chunk() I see that it calls mlx4_table_find() with a pointer to single dma_addr_t - dma_handle - while the dma addresses for the ICM memory is actually a list of different addresses covering possibly different sizes. I think mlx4_table_find() should be changed to support that, and then we can use calls to dma_sync_single_for_cpu()/dma_sync_single_for_device() with the correct dma addresses. Roland, what do you think? On Sat, Aug 08, 2009 at 07:49:22PM +0200, Bart Van Assche wrote: Hello, Has anyone ever encountered a message like the one below ? This message was generated while booting a 2.6.30.4 kernel with CONFIG_DMA_API_DEBUG=y and before any out-of-tree kernel modules were loaded. [ cut here ] WARNING: at lib/dma-debug.c:635 check_sync+0x47c/0x4b0() Hardware name: P5Q DELUXE mlx4_core :01:00.0: DMA-API: device driver tries to sync DMA memory it has not allocated [device address=0x000139482000] [size=4096 bytes] Modules linked in: snd_hda_codec_atihdmi snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_timer snd rtc_cmos soundcore i2c_i801 rtc_core hid_belkin mlx4_core( +) rtc_lib sr_mod sg snd_page_alloc pcspkr button intel_agp i2c_core joydev serio_raw cdrom usbhid hid raid456 raid6_pq async_xor async_memcpy async_tx xor raid0 sd_mod crc_t10dif ehci_hcd uhci_hcd usbcore edd raid1 ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix pata_marvell ahci libata scsi_mod thermal processor thermal_sys hwmon Pid: 1325, comm: work_for_cpu Not tainted 2.6.30.4-scst-debug #6 Call Trace: [8039bc7c] ? check_sync+0x47c/0x4b0 [80248b48] warn_slowpath_common+0x78/0xd0 [80248bfc] warn_slowpath_fmt+0x3c/0x40 [80517769] ? _spin_lock_irqsave+0x49/0x60 [8039b8ab] ? check_sync+0xab/0x4b0 [8039bc7c] check_sync+0x47c/0x4b0 [802724ac] ? mark_held_locks+0x6c/0x90 [8039be1d] debug_dma_sync_single_for_cpu+0x1d/0x20 [a024a969] mlx4_write_mtt+0x159/0x1e0 [mlx4_core] [a0243c02] mlx4_create_eq+0x222/0x650 [mlx4_core] [8027281d] ? 
trace_hardirqs_on+0xd/0x10 [a02441f5] mlx4_init_eq_table+0x1c5/0x4a0 [mlx4_core] [a0248b08] mlx4_setup_hca+0x98/0x550 [mlx4_core] [a0249891] ? __mlx4_init_one+0x8d1/0x920 [mlx4_core] [a0249331] __mlx4_init_one+0x371/0x920 [mlx4_core] [a024df18] mlx4_init_one+0x22/0x44 [mlx4_core] [8025cd90] ? do_work_for_cpu+0x0/0x30 [803a43e2] local_pci_probe+0x12/0x20 [8025cda3] do_work_for_cpu+0x13/0x30 [802613e6] kthread+0x56/0x90 [8020cffa] child_rip+0xa/0x20 [8020c9c0] ? restore_args+0x0/0x30 [80261390] ? kthread+0x0/0x90 [8020cff0] ? child_rip+0x0/0x20 ---[ end trace 4480af29bc755c6a ]--- Bart. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 0/10] RDMAoE support
On Mon, Aug 10, 2009 at 09:45:58AM -0400, Hal Rosenstock wrote: How is port configuration (RDMAoE v. IB) accomplished ? Is it prior to boot time or dynamic ? mlx4 allows changing port designation dynamically by writing to the sysfs. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 10/10] mlx4: Add RDMAoE support - allow interfaces to correspond to each other
On Wed, Aug 05, 2009 at 02:42:59PM -0600, Jason Gunthorpe wrote: What about multicast though? Switches are going to have trouble with group membership lists for non IP packets.. Even just sending a ICMPv6 packet (with an IPv6 ethertype) isn't guaranteed to fix it. In this patch set, all multicast packets use the broadcast mac. We will address this issue at a future time. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 01/10] ib_core: Refine device personality from node type to port type
On Wed, Aug 05, 2009 at 01:43:12PM -0700, Sean Hefty wrote: Can resources (PDs, CQs, MRs, etc.) between the different transports be shared? Does QP failover between transports work? There is nothing in the architecture that precludes this; we are not currently focusing on this. Did you consider modifying rdma_node_get_transport() and returning a bitmask of the supported transports available on the device? I'm wondering if something like this makes sense, to allow skipping devices that are not of interest to a particular module. This would be in addition to the rdma_port_get_transport call. There's just a lot of new checks to handle the transport on a port by port basis. We can use a function: rdma_is_transport_supported(ibdev, transport), which will return true if at least one port runs the given transport. Thus, as long as we have only a few transports, these checks will amount to 1-2 lines of code in each module. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 01/10] ib_core: Refine device personality from node type to port type
On Thu, Aug 06, 2009 at 10:34:19AM -0700, Sean Hefty wrote: Does the implementation allow this? Right now PDs, CQs, etc are allocated per device, not per port. I'm not immediately concerned about QP failover. However, I believe there needs to be some level of coordination between the Infiniband side of the CM and the Ethernet side of the CM, since QPs are associated with CA GUIDs. I'm just trying to understand the impact of this coordination. There is nothing in the implementation to prevent it. We did not see a reason to. The ports share a common node GUID but each one has its own GIDs. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 04/10] IB/umad: Enable support for RDMAoE ports
On Thu, Aug 06, 2009 at 11:05:47AM -0700, Sean Hefty wrote: Is there a need to expose QP1 to user space? The CM is in the kernel, and there's not an SA. Good point. There seems to be no reason to expose it. Will fix. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCHv4 03/10] ib_core: RDMAoE support only QP1
On Thu, Aug 06, 2009 at 10:52:34AM -0700, Sean Hefty wrote: +/* Validate device and port */ +port_priv = ib_get_mad_port(device, port_num); +if (!port_priv) { +ret = ERR_PTR(-ENODEV); +goto error1; +} + +if (!port_priv-qp_info[qp_type].qp) +return NULL; It seems odd that the first if has 'goto error1', but the second if simply returns NULL. The original intention was to release the caller from the need to decide whether to register the mad agent or not and so the NULL returned would not be treated as error. Thinking it over I realize that it would be better to let the caller decide (according to the port protocol) whether or not to register the mad agent. Will fix. @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; +if (!mad_agent) +return 0; Why would a kernel client call ib_unregister_mad_agent with a NULL pointer? Same as above. Goes away after the fix. cq_size = (IB_MAD_QP_SEND_SIZE + IB_MAD_QP_RECV_SIZE) * 2; +has_smi = rdma_port_get_transport(device, port_num) == RDMA_TRANSPORT_IB; +if (has_smi) +cq_size *= 2; cq_size is doubled twice This is a bug - I'll fix it - thanks. I really wish there were a cleaner way to add this support that didn't involve adding so many checks throughout the code. It's hard to know if checks were added in all the places that were needed. I can't think of a clever way to handle QP 0. The fix discussed above will eliminate a good portion of these checks. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv4 0/10] RDMAoE support
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. RDMAoE encodes IP addresses into its GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs, standard IP to MAC mappings apply. To support RDMAoE, a new transport protocol was added to the IB core. An RDMA device can have ports with different transports, which are identified by a port transport attribute. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs while LID fields are ignored. The Ethernet L2 information is subsequently obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. As there is no SA in RDMAoE, the CMA code is modified to fill the necessary path record attributes locally before sending CM packets. Similarly, the CMA provides to the user the required address handle attributes when processing SIDR requests and joining multicast groups. In this patch set, an RDMAoE port is currently assigned a single GID, encoding the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code temporarily uses IPv6 link-local addresses as GIDs instead of the IP address provided by the user, thereby supporting any IP address. In addition, multicast packets currently use the broadcast MAC. To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. 
Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. Following is a series of 10 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches, and is tagged v4. Changes from v3: 1. RDMA transport is determined on a per-port basis instead of the link-type notion. 2. SA services are not provided for RDMAoE clients. CMA code is modified to support RDMAoE transport types. 3. For brevity, GID to MAC resolution is currently restricted to link local addresses. Signed-off-by: Eli Cohen e...@mellanox.co.il --- b/drivers/infiniband/core/agent.c | 12 - b/drivers/infiniband/core/cm.c | 26 +- b/drivers/infiniband/core/cma.c | 54 ++-- b/drivers/infiniband/core/mad.c | 42 ++- b/drivers/infiniband/core/multicast.c |5 b/drivers/infiniband/core/sa_query.c| 40 ++- b/drivers/infiniband/core/ucm.c |8 b/drivers/infiniband/core/ucma.c|2 b/drivers/infiniband/core/ud_header.c | 111 ++ b/drivers/infiniband/core/user_mad.c|7 b/drivers/infiniband/core/uverbs.h |1 b/drivers/infiniband/core/uverbs_cmd.c | 32 ++ b/drivers/infiniband/core/uverbs_main.c |1 b/drivers/infiniband/core/verbs.c | 12 - b/drivers/infiniband/hw/mlx4/ah.c | 187 +--- b/drivers/infiniband/hw/mlx4/main.c | 309 +--- b/drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 + b/drivers/infiniband/hw/mlx4/qp.c | 172 ++- b/drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 - b/drivers/net/mlx4/en_main.c| 15 + b/drivers/net/mlx4/en_port.c|4 b/drivers/net/mlx4/en_port.h|3 b/drivers/net/mlx4/fw.c |3 b/drivers/net/mlx4/intf.c | 20 + b/drivers/net/mlx4/main.c |6 b/drivers/net/mlx4/mlx4.h |1 b/include/linux/mlx4/cmd.h |1 b/include/linux/mlx4/device.h | 31 ++ b/include/linux/mlx4/driver.h | 16 + b/include/linux/mlx4/qp.h |8 b/include/rdma/ib_addr.h| 87 +++ b/include/rdma/ib_pack.h| 26 ++ 
b/include/rdma/ib_user_verbs.h | 21 + b/include/rdma/ib_verbs.h |9 b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |3 b/net/sunrpc/xprtrdma/svc_rdma_transport.c |2 drivers/infiniband/core/cm.c|2 drivers/infiniband/core/cma.c | 150 + drivers/infiniband/core/mad.c | 55 +++- drivers/infiniband/core/ucm.c | 12 - drivers/infiniband/core/ucma.c | 25 +- drivers/infiniband/core/user_mad.c | 27
[ofa-general] [PATCHv4 01/10] ib_core: Refine device personality from node type to port type
As a preparation to devices that, in general, support different transport protocol for each port, specifically RDMAoE, this patch defines transport type for each of a device's ports. As a result rdma_node_get_transport() has been unexported and is used internally by the implementation of the new API, rdma_port_get_transport() which gives the transport protocol of the queried port. All references to rdma_node_get_transport() are changed to to use rdma_port_get_transport(). Also, ib_port_attr is extended to contain enum rdma_transport_type. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cm.c | 26 - drivers/infiniband/core/cma.c | 54 +++-- drivers/infiniband/core/mad.c | 42 +- drivers/infiniband/core/multicast.c |5 +-- drivers/infiniband/core/sa_query.c| 40 - drivers/infiniband/core/ucm.c |8 +++- drivers/infiniband/core/ucma.c|2 +- drivers/infiniband/core/user_mad.c|7 ++-- drivers/infiniband/core/verbs.c | 12 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 +++--- include/rdma/ib_verbs.h |9 +++-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |3 +- net/sunrpc/xprtrdma/svc_rdma_transport.c |2 +- 13 files changed, 128 insertions(+), 94 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 5130fc5..f930f1d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3678,9 +3678,7 @@ static void cm_add_one(struct ib_device *ib_device) unsigned long flags; int ret; u8 i; - - if (rdma_node_get_transport(ib_device-node_type) != RDMA_TRANSPORT_IB) - return; + enum rdma_transport_type tt; cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * ib_device-phys_port_cnt, GFP_KERNEL); @@ -3700,6 +3698,10 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { + tt = rdma_port_get_transport(ib_device, i); + if (tt != RDMA_TRANSPORT_IB) + continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto 
error1; @@ -3742,9 +3744,11 @@ error1: port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); @@ -3770,10 +3774,12 @@ static void cm_remove_one(struct ib_device *ib_device) for (i = 1; i = ib_device-phys_port_cnt; i++) { port = cm_dev-port[i-1]; - ib_modify_port(ib_device, port-port_num, 0, port_modify); - ib_unregister_mad_agent(port-mad_agent); - flush_workqueue(cm.wq); - cm_remove_port_fs(port); + if (port) { + ib_modify_port(ib_device, port-port_num, 0, port_modify); + ib_unregister_mad_agent(port-mad_agent); + flush_workqueue(cm.wq); + cm_remove_port_fs(port); + } } device_unregister(cm_dev-device); kfree(cm_dev); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index aa62101..866ff7f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -337,24 +337,26 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev; union ib_gid gid; int ret = -ENODEV; - - switch (rdma_node_get_transport(dev_addr-dev_type)) { - case RDMA_TRANSPORT_IB: - ib_addr_get_sgid(dev_addr, gid); - break; - case RDMA_TRANSPORT_IWARP: - iw_addr_get_sgid(dev_addr, gid); - break; - default: - return -ENODEV; - } + int port; list_for_each_entry(cma_dev, dev_list, list) { - ret = ib_find_cached_gid(cma_dev-device, gid, -id_priv-id.port_num, NULL); - if (!ret) { - cma_attach_to_dev(id_priv, cma_dev); - break; + for (port = 1; port = cma_dev-device-phys_port_cnt; ++port) { + switch (rdma_port_get_transport
[ofa-general] [PATCHv4 02/10] ib_core: Add RDMAoE transport protocol
Add a new transport protocol, RDMAoE, used for transporting Infiniband traffic over Ethernet fabrics. Signed-off-by: Eli Cohen e...@mellanox.co.il --- include/rdma/ib_verbs.h |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b557129..4eec70f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,7 +69,8 @@ enum rdma_node_type { enum rdma_transport_type { RDMA_TRANSPORT_IB, - RDMA_TRANSPORT_IWARP + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_RDMAOE }; enum ib_device_cap_flags { -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv4 03/10] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/agent.c | 12 +--- drivers/infiniband/core/mad.c | 55 +-- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..c3f2048 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device*device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -175,6 +176,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7b737c4..de83c71 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -199,6 +199,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, unsigned long flags; u8 mgmt_class, vclass; + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + + if (!port_priv-qp_info[qp_type].qp) + 
return NULL; + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) @@ -260,13 +270,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Validate device and port */ - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; + if (!mad_agent) + return 0; + /* If the TID is zero, the agent can only snoop. */ if (mad_agent-hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -2602,6 +2608,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2652,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i IB_MAD_QPS_CORE; i++) { qp = port_priv-qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2684,6 +2696,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i IB_MAD_QPS_CORE; i++) { + if (!port_priv-qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(port_priv-qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX Couldn't post receive WRs\n); @@ -2762,6 +2777,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info-qp) + return; + ib_destroy_qp(qp_info-qp); kfree(qp_info-snoop_table); } @@ -2777,6 +2795,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof ib_mad123]; + int has_smi; /* Create new 
device info */ port_priv = kzalloc(sizeof
[ofa-general] [PATCHv4 05/10] ib/cm: Enable CM support for RDMAoE
CM messages can be transported on RDMAoE protocol ports so they are enabled here. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cm.c |2 +- drivers/infiniband/core/ucm.c | 12 +--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f930f1d..63d6de3 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3699,7 +3699,7 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i = ib_device-phys_port_cnt; i++) { tt = rdma_port_get_transport(ib_device, i); - if (tt != RDMA_TRANSPORT_IB) + if (tt != RDMA_TRANSPORT_IB tt != RDMA_TRANSPORT_RDMAOE) continue; port = kzalloc(sizeof *port, GFP_KERNEL); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 4f5096d..21c78f5 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1240,13 +1240,19 @@ static void ib_ucm_add_one(struct ib_device *device) { struct ib_ucm_device *ucm_dev; int i; + enum rdma_transport_type tt; if (!device-alloc_ucontext || device-node_type == RDMA_NODE_IB_SWITCH) return; - for (i = 1; i = device-phys_port_cnt; ++i) - if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB) - return; + for (i = 1; i = device-phys_port_cnt; ++i) { + tt = rdma_port_get_transport(device, i); + if (tt == RDMA_TRANSPORT_IB || tt == RDMA_TRANSPORT_RDMAOE) + break; + } + + if (i device-phys_port_cnt) + return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); if (!ucm_dev) -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCHv4 06/10] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Path resolving and multicast joining are implemented within cma.c by filling the responses and pushing the callbacks to the cma work queue. IP-GID resolution always yield IPv6 link local addresses - remote GIDs are derived from the destination MAC address of the remote port. Multicast GIDs are always mapped to broadcast MAC (all FFs). Some helper functions are added to ib_addr.h. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cma.c | 150 ++- drivers/infiniband/core/ucma.c | 25 +-- include/rdma/ib_addr.h | 87 +++ 3 files changed, 251 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 866ff7f..8f5675b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -58,6 +58,7 @@ MODULE_LICENSE(Dual BSD/GPL); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define RDMAOE_PACKET_LIFETIME 18 static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -174,6 +175,12 @@ struct cma_ndev_work { struct rdma_cm_eventevent; }; +struct rdmaoe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast*mc; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -348,6 +355,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); break; + case RDMA_TRANSPORT_RDMAOE: + rdmaoe_addr_get_sgid(dev_addr, gid); + break; default: return -ENODEV; } @@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; int ret; + u16 pkey; + +if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) == + RDMA_TRANSPORT_IB) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0x; ret = 
ib_find_cached_pkey(id_priv-id.device, id_priv-id.port_num, - ib_addr_get_pkey(dev_addr), - qp_attr-pkey_index); + pkey, qp_attr-pkey_index); if (ret) return ret; @@ -609,6 +625,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: if (!id_priv-cm_id.ib || cma_is_ud_ps(id_priv-id.ps)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else @@ -836,7 +853,9 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc = container_of(id_priv-mc_list.next, struct cma_multicast, list); list_del(mc-list); - ib_sa_free_multicast(mc-multicast.ib); + if (rdma_port_get_transport(id_priv-cma_dev-device, id_priv-id.port_num) == + RDMA_TRANSPORT_IB) + ib_sa_free_multicast(mc-multicast.ib); kref_put(mc-mcref, release_mc); } } @@ -855,6 +874,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_unlock(lock); switch (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: if (id_priv-cm_id.ib !IS_ERR(id_priv-cm_id.ib)) ib_destroy_cm_id(id_priv-cm_id.ib); break; @@ -1512,6 +1532,7 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) if (id-device) { switch (rdma_port_get_transport(id-device, id-port_num)) { case RDMA_TRANSPORT_IB: + case RDMA_TRANSPORT_RDMAOE: ret = cma_ib_listen(id_priv); if (ret) goto err; @@ -1727,6 +1748,65 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = id_priv-id.route; + struct rdma_addr *addr = route-addr; + struct cma_work *work; + int ret; + struct sockaddr_in
[ofa-general] [PATCHv4 07/10] ib_core: RDMAoE UD packet packing support
Add support functions to aid in packing RDMAoE packets. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/ud_header.c | 111 +++ include/rdma/ib_pack.h | 26 2 files changed, 137 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 8ec7876..d04b6f2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits= 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits= 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -241,6 +264,53 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_rdmaoe_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @header:Structure to initialize + * + * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, grh.payload_length, + * grh.next_header, bth.opcode, bth.pad_count and + * bth.transport_header_version fields of a struct eth_ud_header given + * the payload length and whether a GRH will be included. 
+ */ +void ib_rdmaoe_ud_header_init(int payload_bytes, + int grh_present, + struct eth_ud_header*header) +{ + int header_len; + + memset(header, 0, sizeof *header); + + header_len = + sizeof header-eth + + IB_BTH_BYTES + + IB_DETH_BYTES; + if (grh_present) + header_len += IB_GRH_BYTES; + + header-grh_present = grh_present; + if (grh_present) { + header-grh.ip_version = 6; + header-grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + +IB_DETH_BYTES+ +payload_bytes+ +4+ /* ICRC */ +3) ~3); /* round up */ + header-grh.next_header = 0x1b; + } + + if (header-immediate_present) + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header-bth.pad_count= (4 - payload_bytes) 3; + header-bth.transport_header_version = 0; +} +EXPORT_SYMBOL(ib_rdmaoe_ud_header_init); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header, EXPORT_SYMBOL(ib_ud_header_pack); /** + * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int rdmaoe_ud_header_pack(struct eth_ud_header *header, + void *buf) +{ + int len = 0; + + ib_pack(eth_table, ARRAY_SIZE(eth_table), + header-eth, buf); + len += IB_ETH_BYTES; + + if (header-grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + header-grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + header-bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + header-deth, buf + len); + len += IB_DETH_BYTES; + + if (header-immediate_present) { + memcpy(buf + len, header-immediate_data, + sizeof header-immediate_data); + len += sizeof header-immediate_data; + } + + return len; +} +EXPORT_SYMBOL(rdmaoe_ud_header_pack); + +/** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/include/rdma/ib_pack.h b/include
[ofa-general] [PATCHv4 08/10] ib_core: Add API to support RDMAoE from userspace
Add ib_uverbs_get_mac() to be used by ibv_create_ah() to retirieve the remore port's MAC address. Port transport is also returned by ibv_query_port(). ABI version is incremented from 6 to 7. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 32 drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 10 ++ include/rdma/ib_user_verbs.h | 21 ++--- include/rdma/ib_verbs.h | 12 6 files changed, 74 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index b3ea958..e69b04c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(get_mac); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..012aadf 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width= attr.active_width; resp.active_speed= attr.active_speed; resp.phys_state = attr.phys_state; + resp.transport = attr.transport; if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, sizeof resp)) @@ -1824,6 +1825,37 @@ err: return ret; } +ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_get_maccmd; + struct ib_uverbs_get_mac_resp resp; + int ret; + struct ib_pd*pd; + + if (out_len sizeof resp) + return -ENOSPC; + + if (copy_from_user(cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file-ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac); + put_pd_read(pd); + if (!ret) { + if 
(copy_to_user((void __user *) (unsigned long) cmd.response, +resp, sizeof resp)) + return -EFAULT; + + return in_len; + } + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..2641845 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac, }; static struct vfsmount *uverbs_event_mnt; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3b2f00b..7cce5d6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -912,3 +912,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp-device-detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + +int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac) +{ + if (!device-get_mac) + return -ENOSYS; + + return device-get_mac(device, port, gid, mac); +} +EXPORT_SYMBOL(ib_get_mac); + diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..49eee8a 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -42,7 +42,7 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. 
*/ -#define IB_USER_VERBS_ABI_VERSION 6 +#define IB_USER_VERBS_ABI_VERSION 7 enum { IB_USER_VERBS_CMD_GET_CONTEXT, @@ -81,7 +81,8 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_GET_MAC }; /* @@ -205,7 +206,8 @@ struct ib_uverbs_query_port_resp { __u8 active_width; __u8 active_speed; __u8 phys_state; - __u8 reserved[3]; + __u8 transport; + __u8 reserved[2]; }; struct ib_uverbs_alloc_pd { @@ -621,6 +623,19 @@ struct ib_uverbs_destroy_ah { __u32 ah_handle; }; +struct
[ofa-general] [PATCHv4 09/10] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or a multicast GID and otherwise -EINVAL is returned. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/ah.c | 187 -- drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 +++- drivers/infiniband/hw/mlx4/qp.c | 172 +-- drivers/net/mlx4/fw.c|3 +- include/linux/mlx4/device.h | 31 ++- include/linux/mlx4/qp.h |8 +- 6 files changed, 327 insertions(+), 93 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..0a015c3 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,166 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 s6 = {0}; + struct net_device *netdev; + int ifidx; - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); - memset(ah-av, 0, sizeof ah-av); + memcpy(s6.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + s6.sin6_family = AF_INET6; + s6.sin6_scope_id = ifidx; + if (rdma_link_local_addr(s6.sin6_addr)) + rdma_get_ll_mac(s6.sin6_addr, mac); + else if (rdma_is_multicast_addr(s6.sin6_addr)) { + 
rdma_get_mcast_mac(s6.sin6_addr, mac); + *is_mcast = 1; + } else + return -EINVAL; - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - --ah-av.stat_rate; - } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + return 0; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); + } + + ah-av.ib.dlid= cpu_to_be16(ah_attr-dlid); + if (ah_attr-static_rate) { + ah-av.ib.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; + while (ah-av.ib.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET + !(1 ah-av.ib.stat_rate dev-caps.stat_rate_support)) + --ah-av.ib.stat_rate; } + ah-av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); return ah-ibah; } +static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd-device); + struct mlx4_dev *dev = ibdev-dev; + u8 mac[6]; + int err; + int is_mcast; + + err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, is_mcast); + 
if (err
[ofa-general] [PATCHv4 10/10] mlx4: Add RDMAoE support - allow interfaces to correspond to each other
This patch add support RDMAoE for mlx4. Since mlx4_ib now needs to reference mlx4_en netdevices, a new mechanism was added. Two new fields were added to struct mlx4_interface to define a protocol and a get_prot_dev method to retrieve the corresponding protocol's net device. An implementation of the new verb ib_get_port_link_type() - mlx4_ib_get_port_link_type - was added. mlx4_ib_query_port() has been modified to support eth link types. An interface is considered to be active if its corresponding eth interface is active. Code for setting the GID table of a port has been added. Currently, each IB port has a single GID entry in its table and that GID entery equals the link local IPv6 address. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c | 309 + drivers/net/mlx4/en_main.c| 15 ++- drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|3 +- drivers/net/mlx4/intf.c | 20 +++ drivers/net/mlx4/main.c |6 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/cmd.h |1 + include/linux/mlx4/driver.h | 16 ++- 9 files changed, 335 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d759..737c6b9 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -34,9 +34,12 @@ #include linux/module.h #include linux/init.h #include linux/errno.h +#include linux/netdevice.h +#include linux/inetdevice.h #include rdma/ib_smi.h #include rdma/ib_user_verbs.h +#include rdma/ib_addr.h #include linux/mlx4/driver.h #include linux/mlx4/cmd.h @@ -57,6 +60,15 @@ static const char mlx4_ib_version[] = DRV_NAME : Mellanox ConnectX InfiniBand driver v DRV_VERSION ( DRV_RELDATE )\n; +struct update_gid_work { + struct work_struct work; + union ib_gid gids[128]; + int port; + struct mlx4_ib_dev *dev; +}; + +static struct workqueue_struct *wq; + static void init_query_mad(struct ib_smp *mad) { mad-base_version = 1; @@ -152,28 +164,19 @@ out: return err; } -static int 
mlx4_ib_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +static enum rdma_transport_type +mlx4_ib_port_get_transport(struct ib_device *device, u8 port_num) { - struct ib_smp *in_mad = NULL; - struct ib_smp *out_mad = NULL; - int err = -ENOMEM; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) - goto out; - - memset(props, 0, sizeof *props); - - init_query_mad(in_mad); - in_mad-attr_id = IB_SMP_ATTR_PORT_INFO; - in_mad-attr_mod = cpu_to_be32(port); + struct mlx4_dev *dev = to_mdev(device)-dev; - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); - if (err) - goto out; + return dev-caps.port_mask (1 (port_num - 1)) ? + RDMA_TRANSPORT_IB : RDMA_TRANSPORT_RDMAOE; +} +static void ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ props-lid = be16_to_cpup((__be16 *) (out_mad-data + 16)); props-lmc = out_mad-data[34] 0x7; props-sm_lid = be16_to_cpup((__be16 *) (out_mad-data + 18)); @@ -193,6 +196,67 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, props-subnet_timeout = out_mad-data[51] 0x1f; props-max_vl_num = out_mad-data[37] 4; props-init_type_reply = out_mad-data[41] 4; + props-transport= RDMA_TRANSPORT_IB; +} + +static void eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ + struct mlx4_ib_rdmaoe *rdmaoe = to_mdev(ibdev)-rdmaoe; + struct net_device *ndev; + + props-port_cap_flags = IB_PORT_CM_SUP; + props-gid_tbl_len = to_mdev(ibdev)-dev-caps.gid_table_len[port]; + props-max_msg_sz = to_mdev(ibdev)-dev-caps.max_msg_sz; + props-pkey_tbl_len = 1; + props-bad_pkey_cntr= be16_to_cpup((__be16 *) (out_mad-data + 46)); + props-qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad-data + 48)); + props-active_width = 0; + props-active_speed = 0; + props-max_mtu = out_mad-data[41] 0xf; + props-subnet_timeout = 0; 
+ props-max_vl_num = out_mad-data[37] 4; + props-init_type_reply = 0; + props-transport= RDMA_TRANSPORT_RDMAOE; + spin_lock(rdmaoe-lock); + ndev = rdmaoe-netdevs[port - 1]; + if (!ndev) + goto out
[ofa-general] [PATCHv4] libibverbs: Add RDMAoE support
Extend the ibv_query_port() verb to return a port transport protocol which can be one of RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP or RDMA_TRANSPORT_RDMAOE. This can be used by applications to know if they must use GRH as is the case in RDMAoE. Add a new system call to get the MAC address of the remote port that a UD address vector refers to. Update ibv_rc_pingpong and ibv_ud_pingpong to accept a remote GID so that they can be used with an RDMAoE port. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changed the reference to a port from link type to protocol type. This patch is tagged v4 to create correspondence with the kernel patches. examples/devinfo.c| 15 examples/pingpong.c |9 +++ examples/pingpong.h |2 + examples/rc_pingpong.c| 50 examples/ud_pingpong.c| 38 +++ include/infiniband/driver.h |1 + include/infiniband/kern-abi.h | 25 ++-- include/infiniband/verbs.h| 12 + src/cmd.c | 20 src/libibverbs.map|1 + 10 files changed, 155 insertions(+), 18 deletions(-) diff --git a/examples/devinfo.c b/examples/devinfo.c index caa5d5f..a42a6dc 100644 --- a/examples/devinfo.c +++ b/examples/devinfo.c @@ -175,6 +175,20 @@ static int print_all_port_gids(struct ibv_context *ctx, uint8_t port_num, int tb return rc; } +static const char *transport_type_str(enum rdma_transport_type type) +{ + switch (type) { + case RDMA_TRANSPORT_IB: + return IB; + case RDMA_TRANSPORT_IWARP: + return IWARP; + case RDMA_TRANSPORT_RDMAOE: + return RDMAOE; + default: + return Unknown; + } +} + static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) { struct ibv_context *ctx; @@ -273,6 +287,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid); printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid); printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc); + printf(\t\t\ttrasnport_type:\t\t%s\n, transport_type_str(port_attr.transport)); if (verbose) { printf(\t\t\tmax_msg_sz:\t\t0x%x\n, port_attr.max_msg_sz); diff --git 
a/examples/pingpong.c b/examples/pingpong.c index b916f59..d4a46e4 100644 --- a/examples/pingpong.c +++ b/examples/pingpong.c @@ -31,6 +31,8 @@ */ #include pingpong.h +#include arpa/inet.h +#include stdlib.h enum ibv_mtu pp_mtu_to_enum(int mtu) { @@ -53,3 +55,10 @@ uint16_t pp_get_local_lid(struct ibv_context *context, int port) return attr.lid; } + +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr) +{ + return ibv_query_port(context, port, attr); +} + diff --git a/examples/pingpong.h b/examples/pingpong.h index 71d7c3f..16d3466 100644 --- a/examples/pingpong.h +++ b/examples/pingpong.h @@ -37,5 +37,7 @@ enum ibv_mtu pp_mtu_to_enum(int mtu); uint16_t pp_get_local_lid(struct ibv_context *context, int port); +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr); #endif /* IBV_PINGPONG_H */ diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c index 26fa45c..4250cdf 100644 --- a/examples/rc_pingpong.c +++ b/examples/rc_pingpong.c @@ -67,6 +67,8 @@ struct pingpong_context { int size; int rx_depth; int pending; + struct ibv_port_attr portinfo; + union ibv_giddgid; }; struct pingpong_dest { @@ -94,6 +96,12 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, .port_num = port } }; + + if (ctx-dgid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx-dgid; + } if (ibv_modify_qp(ctx-qp, attr, IBV_QP_STATE | IBV_QP_AV | @@ -289,11 +297,11 @@ out: static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, - int use_event) + int use_event, int is_server) { struct pingpong_context *ctx; - ctx = malloc(sizeof *ctx); + ctx = calloc(1, sizeof *ctx); if (!ctx) return NULL; @@ -306,7 +314,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, return NULL; } - memset
[ofa-general] [PATCHv4] libmlx4: Add RDMAoE support
Modify mlx4_create_ah() to check the port's transport protocol, and for the case of RDMAoE ports, do a system call to retrieve the remote port's MAC address. Make modifications to address vector data structs and code to accomodate for RDMAoE. --- Changed the reference to a port from link type to protocol type. This patch is tagged v4 to create correspondence with the kernel patches. src/mlx4.h |3 +++ src/qp.c|2 ++ src/verbs.c | 29 + src/wqe.h |3 ++- 4 files changed, 36 insertions(+), 1 deletions(-) diff --git a/src/mlx4.h b/src/mlx4.h index 827a201..20d3fdd 100644 --- a/src/mlx4.h +++ b/src/mlx4.h @@ -236,11 +236,14 @@ struct mlx4_av { uint8_t hop_limit; uint32_tsl_tclass_flowlabel; uint8_t dgid[16]; + uint8_t mac[8]; }; struct mlx4_ah { struct ibv_ah ibv_ah; struct mlx4_av av; + uint16_tvlan; + uint8_t mac[6]; }; static inline unsigned long align(unsigned long val, unsigned long align) diff --git a/src/qp.c b/src/qp.c index d194ae3..cd8fab0 100644 --- a/src/qp.c +++ b/src/qp.c @@ -143,6 +143,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, memcpy(dseg-av, to_mah(wr-wr.ud.ah)-av, sizeof (struct mlx4_av)); dseg-dqpn = htonl(wr-wr.ud.remote_qpn); dseg-qkey = htonl(wr-wr.ud.remote_qkey); + dseg-vlan = htons(to_mah(wr-wr.ud.ah)-vlan); + memcpy(dseg-mac, to_mah(wr-wr.ud.ah)-mac, 6); } static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) diff --git a/src/verbs.c b/src/verbs.c index cc179a0..e60ab05 100644 --- a/src/verbs.c +++ b/src/verbs.c @@ -614,9 +614,21 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp) return 0; } +static int mcast_mac(uint8_t *mac) +{ + int i; + uint8_t val = 0xff; + + for (i = 0; i 6; ++i) + val = mac[i]; + + return val == 0xff; +} + struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { struct mlx4_ah *ah; + struct ibv_port_attr port_attr; ah = malloc(sizeof *ah); if (!ah) @@ -642,7 +654,24 @@ struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) 
memcpy(ah-av.dgid, attr-grh.dgid.raw, 16); } + if (ibv_query_port(pd-context, attr-port_num, port_attr)) + goto err; + + if (port_attr.transport == RDMA_TRANSPORT_RDMAOE) { + if (ibv_cmd_get_mac(pd, attr-port_num, ah-av.dgid, ah-mac)) + goto err; + + ah-vlan = 0; + if (mcast_mac(ah-mac)) + ah-av.dlid = htons(0xc000); + + } + + return ah-ibv_ah; +err: + free(ah); + return NULL; } int mlx4_destroy_ah(struct ibv_ah *ah) diff --git a/src/wqe.h b/src/wqe.h index 6f7f309..ea6f27f 100644 --- a/src/wqe.h +++ b/src/wqe.h @@ -78,7 +78,8 @@ struct mlx4_wqe_datagram_seg { uint32_tav[8]; uint32_tdqpn; uint32_tqkey; - uint32_treserved[2]; + __be16 vlan; + uint8_t mac[6]; }; struct mlx4_wqe_data_seg { -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH] cma: fix access to freed memory
On Wed, Aug 05, 2009 at 08:46:43AM -0700, Sean Hefty wrote: rdma_destroy_id and rdma_leave_multicast call ib_sa_free_multicast. This call will block until the join callback completes or is canceled. Can you describe the race with cma_ib_mc_handler in more detail? That explains it. I was using a different join implementation for RDMAoE without a leave operation, so I had to use this kref solution. So there is no need for this patch. I will provide a distinct solution for the RDMAoE case. Thanks. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH v2] cma: fix access to freed memory
rdma_join_multicast() allocates struct cma_multicast and then proceeds to join to a multicast address. However, the join operation completes in another context and the allocated struct could be released if the user destroys either the rdma_id object or decides to leave the multicast group while the join operation is in progress. This patch uses a kref object to maintain reference counting to avoid such situation. Signed-off-by: Eli Cohen e...@mellanox.co.il --- Changes from previous version: I removed the protection of mc list manipulation using spinlocks becuase - a. In order to break into different patches b. I have doubts as for the necessity of this protection. drivers/infiniband/core/cma.c | 20 1 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..aa62101 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -157,6 +157,7 @@ struct cma_multicast { struct list_headlist; void*context; struct sockaddr_storage addr; + struct kref mcref; }; struct cma_work { @@ -290,6 +291,13 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(cma_dev-comp); } +void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(id_priv-list); @@ -827,7 +835,7 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) struct cma_multicast, list); list_del(mc-list); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + kref_put(mc-mcref, release_mc); } } @@ -2643,7 +2651,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) id_priv = mc-id_priv; if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) - return 0; + goto out; mutex_lock(id_priv-qp_mutex); if (!status id_priv-id.qp) @@ -2669,10 +2677,12 @@ static int 
cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(id_priv-handler_mutex); rdma_destroy_id(id_priv-id); - return 0; + goto out; } mutex_unlock(id_priv-handler_mutex); +out: + kref_put(mc-mcref, release_mc); return 0; } @@ -2759,11 +2769,13 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(mc-addr, addr, ip_addr_size(addr)); mc-context = context; mc-id_priv = id_priv; + kref_init(mc-mcref); spin_lock(id_priv-lock); list_add(mc-list, id_priv-mc_list); spin_unlock(id_priv-lock); + kref_get(mc-mcref); switch (rdma_node_get_transport(id-device-node_type)) { case RDMA_TRANSPORT_IB: ret = cma_join_ib_multicast(id_priv, mc); @@ -2800,7 +2812,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) mc-multicast.ib-rec.mgid, mc-multicast.ib-rec.mlid); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + kref_put(mc-mcref, release_mc); return; } } -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [PATCH] cma: fix access to freed memory
On Tue, Aug 04, 2009 at 09:05:13AM -0700, Roland Dreier wrote: Maybe it's just a loose connection but yet, it seems to me that operations on id_priv->mc_list should be protected. Should I send a different patch? "seems ... should be" is very weak justification for locking. What should they be protected from? What if rdma_join_multicast() is called while rdma_destroy_id() runs - for example from cma_ib_handler() due to an error returned from the handler? In this case list_add(&mc->list, &id_priv->mc_list) in rdma_join_multicast() may be executed along with the list manipulation done in cma_leave_mc_groups(). Generally, it looks strange that in some places list handling is protected with a spinlock and in other places not. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] cma: fix access to freed memory
rdma_join_multicast() allocates struct cma_multicast and then proceeds to join to a multicast address. However, the join operation completes in another context and the allocated struct could be released if the user destroys either the rdma_id object or decides to leave the multicast group while the join is in progress. This patch uses reference counting to to avoid such situation. It also protects removal from id_priv-mc_list in cma_leave_mc_groups(). Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/cma.c | 23 +++ 1 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..8fee477 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -157,6 +157,7 @@ struct cma_multicast { struct list_headlist; void*context; struct sockaddr_storage addr; + atomic_trefcount; }; struct cma_work { @@ -290,6 +291,12 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(cma_dev-comp); } +void cma_deref_mc(struct cma_multicast *mc) +{ + if (atomic_dec_and_test(mc-refcount)) + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(id_priv-list); @@ -822,13 +829,17 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; + spin_lock_irq(id_priv-lock); while (!list_empty(id_priv-mc_list)) { mc = container_of(id_priv-mc_list.next, struct cma_multicast, list); list_del(mc-list); + spin_unlock_irq(id_priv-lock); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + cma_deref_mc(mc); + spin_lock_irq(id_priv-lock); } + spin_unlock_irq(id_priv-lock); } void rdma_destroy_id(struct rdma_cm_id *id) @@ -2643,7 +2654,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) id_priv = mc-id_priv; if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) - return 0; + goto out; mutex_lock(id_priv-qp_mutex); if 
(!status id_priv-id.qp) @@ -2669,10 +2680,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(id_priv-handler_mutex); rdma_destroy_id(id_priv-id); - return 0; + goto out; } mutex_unlock(id_priv-handler_mutex); +out: + cma_deref_mc(mc); return 0; } @@ -2759,11 +2772,13 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(mc-addr, addr, ip_addr_size(addr)); mc-context = context; mc-id_priv = id_priv; + atomic_set(mc-refcount, 1); spin_lock(id_priv-lock); list_add(mc-list, id_priv-mc_list); spin_unlock(id_priv-lock); + atomic_inc(mc-refcount); switch (rdma_node_get_transport(id-device-node_type)) { case RDMA_TRANSPORT_IB: ret = cma_join_ib_multicast(id_priv, mc); @@ -2800,7 +2815,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) mc-multicast.ib-rec.mgid, mc-multicast.ib-rec.mlid); ib_sa_free_multicast(mc-multicast.ib); - kfree(mc); + cma_deref_mc(mc); return; } } -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] mlx4_core: map sufficient ICM memory for EQs
Current implementation allocates a single host page for EQ context memory. As the number of CPU cores increases, and since the number of required EQs depends on this number, this patch removes the hard coded limit and makes the allocation dependent on EQ entry size and the number of required EQs. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/net/mlx4/eq.c | 42 -- drivers/net/mlx4/main.c |1 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/device.h |1 + 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index b9ceddd..1d41b1a 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -530,29 +530,34 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) { struct mlx4_priv *priv = mlx4_priv(dev); int ret; + int host_pages, icm_pages; + int i; - /* -* We assume that mapping one page is enough for the whole EQ -* context table. This is fine with all current HCAs, because -* we only use 32 EQs and each EQ uses 64 bytes of context -* memory, or 1 KB total. 
-*/ + host_pages = ALIGN(min_t(int, dev-caps.num_eqs, num_possible_cpus() + 1) * + dev-caps.eqc_entry_size, PAGE_SIZE) PAGE_SHIFT; + priv-eq_table.order = ilog2(roundup_pow_of_two(host_pages)); priv-eq_table.icm_virt = icm_virt; - priv-eq_table.icm_page = alloc_page(GFP_HIGHUSER); + priv-eq_table.icm_page = alloc_pages(GFP_HIGHUSER, priv-eq_table.order); if (!priv-eq_table.icm_page) return -ENOMEM; priv-eq_table.icm_dma = pci_map_page(dev-pdev, priv-eq_table.icm_page, 0, - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + PAGE_SIZE priv-eq_table.order, + PCI_DMA_BIDIRECTIONAL); if (pci_dma_mapping_error(dev-pdev, priv-eq_table.icm_dma)) { - __free_page(priv-eq_table.icm_page); + __free_pages(priv-eq_table.icm_page, priv-eq_table.order); return -ENOMEM; } - ret = mlx4_MAP_ICM_page(dev, priv-eq_table.icm_dma, icm_virt); - if (ret) { - pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv-eq_table.icm_page); + icm_pages = (PAGE_SIZE / MLX4_ICM_PAGE_SIZE) * (1 priv-eq_table.order); + for (i = 0; i icm_pages; ++i) { + ret = mlx4_MAP_ICM_page(dev, priv-eq_table.icm_dma, icm_virt + i * MLX4_ICM_PAGE_SIZE); + if (ret) { + mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, i); + pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_pages(priv-eq_table.icm_page, priv-eq_table.order); + break; + } } return ret; @@ -561,11 +566,12 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) void mlx4_unmap_eq_icm(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); + int icm_pages = (PAGE_SIZE / MLX4_ICM_PAGE_SIZE) * (1 priv-eq_table.order); - mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, 1); - pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv-eq_table.icm_page); + mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, icm_pages); + pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, + PAGE_SIZE priv-eq_table.order, PCI_DMA_BIDIRECTIONAL); + 
__free_pages(priv-eq_table.icm_page, priv-eq_table.order); } int mlx4_alloc_eq_table(struct mlx4_dev *dev) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index dac621b..872becd 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -207,6 +207,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev-caps.max_cqes = dev_cap-max_cq_sz - 1; dev-caps.reserved_cqs = dev_cap-reserved_cqs; dev-caps.reserved_eqs = dev_cap-reserved_eqs; + dev-caps.eqc_entry_size = dev_cap-eqc_entry_sz; dev-caps.mtts_per_seg = 1 log_mtts_per_seg; dev-caps.reserved_mtts = DIV_ROUND_UP(dev_cap-reserved_mtts, dev-caps.mtts_per_seg); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 5bd79c2..1a20fa3 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -211,6 +211,7 @@ struct mlx4_eq_table { struct mlx4_icm_table cmpt_table; int have_irq; u8 inta_pin; + int
Re: [ofa-general] Re: [ewg] [PATCH 1/8 v3] ib_core: Add API to support RDMAoE
On Mon, Jul 13, 2009 at 03:26:06PM -0400, Hal Rosenstock wrote: +enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 port_num) +{ + return device->get_port_link_type ? + device->get_port_link_type(device, port_num) : PORT_LINK_IB; So do iWARP devices return PORT_LINK_IB ? If so, that seems a little weird to me. -- Hal Maybe it's more appropriate to make this function mandatory and require all drivers to report the correct port type. What do you think? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 2/8 v3] ib_core: RDMAoE support only QP1
On Mon, Jul 13, 2009 at 03:26:34PM -0400, Hal Rosenstock wrote: On Mon, Jul 13, 2009 at 2:14 PM, Eli Cohen e...@mellanox.co.il wrote: Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. What happens with other QP1 traffic (other than CM and SA) ? I think it should work but I haven't tried that. Userspace can access QP1 (and QP0). QP0 is not accessible since ib_register_mad_agent() will fail for QP0 because of this: if (!port_priv->qp_info[qp_type].qp) return NULL; QP1 should work in the same way Does QP0 error out ? What about QP1 ? Does it just timeout ? If so, a direct error would be better. See above - you can't access QP0. Do you know of a utility from userspace which sends/receives MADs on QP0 or QP1? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 2/8 v3] ib_core: RDMAoE support only QP1
On Tue, Jul 14, 2009 at 07:15:44AM -0400, Hal Rosenstock wrote: On Tue, Jul 14, 2009 at 3:46 AM, Eli Cohen e...@dev.mellanox.co.il wrote: On Mon, Jul 13, 2009 at 03:26:34PM -0400, Hal Rosenstock wrote: On Mon, Jul 13, 2009 at 2:14 PM, Eli Cohen e...@mellanox.co.il wrote: Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. What happens with other QP1 traffic (other than CM and SA) ? I think it should work but I haven't tried that. Would you ? You could try tools from infiniband-diags or ibdiagnet. Yes I would try that. But I need something that will not fail because it could not open QP0. For example, something that uses only QP1. Are there any in ibutils? Userspace can access QP1 (and QP0). QP0 is not accessible since ib_register_mad_agent() will fail for QP0 because of this: if (!port_priv->qp_info[qp_type].qp) return NULL; QP1 should work in the same way So what happens with things like PerfMgt class ? I think it ends up timing out if no receiver consumer is present. Does QP0 error out ? What about QP1 ? Does it just timeout ? If so, a direct error would be better. See above - you can't access QP0. Do you know of a utility from userspace which sends/receives MADs on QP0 or QP1? Yes, opensm, infiniband-diags (various), and ibutils (ibdiagnet, etc). -- Hal ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 0/8 v3] RDMAoE support
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using Ethernet frames allowing the deployment of IB semantics on lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a GRH, unmodified IB transport headers and payload. Aside from the considerations pointed out below, RDMAoE ports are functionally equivalent to regular IB ports from the RDMA stack perspective. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. In Ethernet, nodes are commonly referred to by applications by means of an IP address. RDMAoE encodes the IP addresses that were assigned to the corresponding Ethernet port into its GIDs, and makes use of the IP stack to bind a destination address to the corresponding netdevice (just as the CMA does today for IB and iWARP) and to obtain its L2 MAC addresses. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs and the L2 address fields in the API are ignored. The Ethernet L2 information is then obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. In order to maximize transparency for applications, RDMAoE implements a dedicated API that provides services equivalent to some of those provided by the IB-SA. The current approach is strictly local but may evolve in the future. This API is implemented using an independent source code file which allows for seamless evolution of the code without affecting the IB native SA interfaces. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE. To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers must be loaded, and the netdevice for the corresponding RDMAoE port must be running. 
Individual ports of a multi port HCA can be independently configured as Ethernet (with support for RDMAoE) or IB, as is already the case. Following is a series of 8 patches based on version 2.6.30 of the Linux kernel. This new series reflects changes based on feedback from the community on the previous set of patches. The whole series is tagged v3. Signed-off-by: Eli Cohen e...@mellanox.co.il drivers/infiniband/core/Makefile |2 drivers/infiniband/core/addr.c| 20 drivers/infiniband/core/agent.c | 12 drivers/infiniband/core/cma.c | 124 +++ drivers/infiniband/core/mad.c | 48 + drivers/infiniband/core/multicast.c | 43 - drivers/infiniband/core/multicast.h | 79 ++ drivers/infiniband/core/rdmaoe_sa.c | 942 ++ drivers/infiniband/core/sa.h | 24 drivers/infiniband/core/sa_query.c| 26 drivers/infiniband/core/ud_header.c | 111 +++ drivers/infiniband/core/uverbs.h |1 drivers/infiniband/core/uverbs_cmd.c | 33 + drivers/infiniband/core/uverbs_main.c |1 drivers/infiniband/core/verbs.c | 17 drivers/infiniband/hw/mlx4/ah.c | 228 ++- drivers/infiniband/hw/mlx4/main.c | 276 +++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 drivers/infiniband/hw/mlx4/qp.c | 253 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c |3 drivers/net/mlx4/cmd.c|6 drivers/net/mlx4/en_main.c| 15 drivers/net/mlx4/en_port.c|4 drivers/net/mlx4/en_port.h|3 drivers/net/mlx4/intf.c | 20 drivers/net/mlx4/main.c |6 drivers/net/mlx4/mlx4.h |1 include/linux/mlx4/cmd.h |1 include/linux/mlx4/device.h | 31 include/linux/mlx4/driver.h | 16 include/linux/mlx4/qp.h |8 include/rdma/ib_addr.h| 53 + include/rdma/ib_pack.h| 26 include/rdma/ib_user_verbs.h | 21 include/rdma/ib_verbs.h | 22 include/rdma/rdmaoe_sa.h | 66 ++ 36 files changed, 2333 insertions(+), 239 deletions(-) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 1/8 v3] ib_core: Add API to support RDMAoE
Add two API functions needed for RDMAoE. ib_get_port_link_type() returns the link type support by the given device's port. It can be either PORT_LINK_IB for IB link layer or PORT_LINK_ETH for Ethernet links. Link type is reported to in query_port verb. ib_get_mac() will return the Ethernet MAC address leading to the port whose GID is spcified. This function is exported to userspace applications. ABI version is incremented from 6 to 7. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 33 + drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 17 + include/rdma/ib_user_verbs.h | 21 ++--- include/rdma/ib_verbs.h | 22 ++ 6 files changed, 92 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index b3ea958..e69b04c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(get_mac); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..eefc414 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width= attr.active_width; resp.active_speed= attr.active_speed; resp.phys_state = attr.phys_state; + resp.link_type = attr.link_type; if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, sizeof resp)) @@ -1824,6 +1825,38 @@ err: return ret; } +ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_get_maccmd; + struct ib_uverbs_get_mac_resp resp; + int ret; + struct ib_pd*pd; + + if (out_len sizeof resp) + return -ENOSPC; + + if 
(copy_from_user(cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file-ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac); + put_pd_read(pd); + if (!ret) { + if (copy_to_user((void __user *) (unsigned long) cmd.response, +resp, sizeof resp)) { + return -EFAULT; + } + return in_len; + } + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..b2f148f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac }; static struct vfsmount *uverbs_event_mnt; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a7da9be..bde5b0d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -94,6 +94,13 @@ rdma_node_get_transport(enum rdma_node_type node_type) } EXPORT_SYMBOL(rdma_node_get_transport); +enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 port_num) +{ + return device-get_port_link_type ? 
+ device-get_port_link_type(device, port_num) : PORT_LINK_IB; +} +EXPORT_SYMBOL(ib_get_port_link_type); + /* Protection domains */ struct ib_pd *ib_alloc_pd(struct ib_device *device) @@ -904,3 +911,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp-device-detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + +int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac) +{ + if (!device-get_mac) + return -ENOSYS; + + return device-get_mac(device, port, gid, mac); +} +EXPORT_SYMBOL(ib_get_mac); + diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..184203b 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -42,7 +42,7 @@ * Increment this value if any changes that break userspace ABI
[ofa-general] [PATCH 2/8 v3] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1 is still needed since it handles communications between CM agents. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/agent.c | 12 ++--- drivers/infiniband/core/mad.c | 48 ++- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..c3f2048 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device*device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -175,6 +176,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index de922a0..3d5449f 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -199,6 +199,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, unsigned long flags; u8 mgmt_class, vclass; + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + + if (!port_priv-qp_info[qp_type].qp) 
+ return NULL; + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) @@ -260,13 +270,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Validate device and port */ - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; + if (!mad_agent) + return 0; + /* If the TID is zero, the agent can only snoop. */ if (mad_agent-hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -2602,6 +2608,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2652,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i IB_MAD_QPS_CORE; i++) { qp = port_priv-qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2684,6 +2696,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i IB_MAD_QPS_CORE; i++) { + if (!port_priv-qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(port_priv-qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX Couldn't post receive WRs\n); @@ -2762,6 +2777,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info-qp) + return; + ib_destroy_qp(qp_info-qp); kfree(qp_info-snoop_table); } @@ -2777,6 +2795,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof ib_mad123]; + int has_smi; /* Create new 
device info */ port_priv = kzalloc(sizeof
[ofa-general] [PATCH 3/8 v3] ib_core: Add RDMAoE SA support
Add support for resolving paths and joining multicast group for RDMAoE ports. For Eth links, path resolution will complete immediately but will call the callback from a workqueue context to avoid deadloks. Multicast joins are handled in nearly the same way as IB mulitcast joins are handled in multicast.c. However they are handled entirly at the host and no MADs are involved. This allows for a client to create groups and dictate the qkey to be used in that group. The code is put in rdmaoe_sa.c which handles both multicast joins/leaves and path resolution. The following files were added: drivers/infiniband/core/multicast.h drivers/infiniband/core/rdmaoe_sa.c include/rdma/rdmaoe_sa.h There are changes made in vers/infiniband/core/multicast.c, drivers/infiniband/core/sa_query.c and drivers/infiniband/core/sa.h to allow sharing of data structs. New API functions are added for RDMAoE and comsumenrs who want to use this API need to be changed. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/Makefile|2 +- drivers/infiniband/core/multicast.c | 43 +-- drivers/infiniband/core/multicast.h | 79 +++ drivers/infiniband/core/rdmaoe_sa.c | 938 +++ drivers/infiniband/core/sa.h| 24 + drivers/infiniband/core/sa_query.c | 26 +- include/rdma/rdmaoe_sa.h| 66 +++ 7 files changed, 1113 insertions(+), 65 deletions(-) create mode 100644 drivers/infiniband/core/multicast.h create mode 100644 drivers/infiniband/core/rdmaoe_sa.c create mode 100644 include/rdma/rdmaoe_sa.h diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index cb1ab3e..96db705 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -2,7 +2,7 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS):= rdma_ucm.o obj-$(CONFIG_INFINIBAND) +=ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o $(infiniband-y) + ib_cm.o iw_cm.o rdmaoe_sa.o $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += 
ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=ib_uverbs.o ib_ucm.o \ $(user_access-y) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 107f170..727a55a 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -39,6 +39,7 @@ #include rdma/ib_cache.h #include sa.h +#include multicast.h static void mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device); @@ -72,52 +73,10 @@ struct mcast_device { struct mcast_port port[0]; }; -enum mcast_state { - MCAST_JOINING, - MCAST_MEMBER, - MCAST_ERROR, -}; - -enum mcast_group_state { - MCAST_IDLE, - MCAST_BUSY, - MCAST_GROUP_ERROR, - MCAST_PKEY_EVENT -}; - enum { MCAST_INVALID_PKEY_INDEX = 0x }; -struct mcast_member; - -struct mcast_group { - struct ib_sa_mcmember_rec rec; - struct rb_node node; - struct mcast_port *port; - spinlock_t lock; - struct work_struct work; - struct list_headpending_list; - struct list_headactive_list; - struct mcast_member *last_join; - int members[3]; - atomic_trefcount; - enum mcast_group_state state; - struct ib_sa_query *query; - int query_id; - u16 pkey_index; -}; - -struct mcast_member { - struct ib_sa_multicast multicast; - struct ib_sa_client *client; - struct mcast_group *group; - struct list_headlist; - enum mcast_statestate; - atomic_trefcount; - struct completion comp; -}; - static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, diff --git a/drivers/infiniband/core/multicast.h b/drivers/infiniband/core/multicast.h new file mode 100644 index 000..17eb9fe --- /dev/null +++ b/drivers/infiniband/core/multicast.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions
[ofa-general] [PATCH 4/8 v3] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Modify calls to the SA such that the link type is veirifed first and the appropriate call is made to either IB SA or RDMAoE SA. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/addr.c | 20 --- drivers/infiniband/core/cma.c | 124 +--- include/rdma/ib_addr.h | 53 + 3 files changed, 166 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index ce511d8..440e613 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -64,7 +64,7 @@ struct addr_req { static void process_req(struct work_struct *work); -static DEFINE_MUTEX(lock); +static DEFINE_SPINLOCK(lock); static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; @@ -163,7 +163,7 @@ static void queue_req(struct addr_req *req) { struct addr_req *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_reverse(temp_req, req_list, list) { if (time_after_eq(req-timeout, temp_req-timeout)) break; @@ -173,7 +173,7 @@ static void queue_req(struct addr_req *req) if (req_list.next == req-list) set_timeout(req-timeout); - mutex_unlock(lock); + spin_unlock(lock); } static void addr_send_arp(struct sockaddr *dst_in) @@ -207,7 +207,9 @@ static void addr_send_arp(struct sockaddr *dst_in) if (!dst) return; - neigh_event_send(dst-neighbour, NULL); + if (dst-neighbour) + neigh_event_send(dst-neighbour, NULL); + dst_release(dst); break; } @@ -322,7 +324,7 @@ static void process_req(struct work_struct *work) INIT_LIST_HEAD(done_list); - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-status == -ENODATA) { src_in = (struct sockaddr *) req-src_addr; @@ -341,7 +343,7 @@ static void process_req(struct work_struct *work) req = list_entry(req_list.next, struct addr_req, list); set_timeout(req-timeout); } - mutex_unlock(lock); + spin_unlock(lock); 
list_for_each_entry_safe(req, temp_req, done_list, list) { list_del(req-list); @@ -439,7 +441,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, struct addr_req *req; int ret = 0; - req = kzalloc(sizeof *req, GFP_KERNEL); + req = kzalloc(sizeof *req, GFP_ATOMIC); if (!req) return -ENOMEM; @@ -483,7 +485,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-addr == addr) { req-status = -ECANCELED; @@ -493,7 +495,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) break; } } - mutex_unlock(lock); + spin_unlock(lock); } EXPORT_SYMBOL(rdma_addr_cancel); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..6cf0f1b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -49,6 +49,7 @@ #include rdma/ib_cache.h #include rdma/ib_cm.h #include rdma/ib_sa.h +#include rdma/rdmaoe_sa.h #include rdma/iw_cm.h MODULE_AUTHOR(Sean Hefty); @@ -69,6 +70,8 @@ static struct ib_client cma_client = { }; static struct ib_sa_client sa_client; +static struct ib_sa_client rdmaoe_sa_client; +static struct ib_sa_client rdmaoe_mcast_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); @@ -327,22 +330,30 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, llgid, *tmp; int ret = -ENODEV; switch (rdma_node_get_transport(dev_addr-dev_type)) { case RDMA_TRANSPORT_IB: ib_addr_get_sgid(dev_addr, gid); + tmp = gid; break; case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); + rdma_mac_to_ll_addr(dev_addr-src_dev_addr, llgid); break; default: return -ENODEV; } list_for_each_entry(cma_dev, dev_list, list) { - ret = ib_find_cached_gid(cma_dev-device, gid, + if 
(ib_get_port_link_type(cma_dev-device, id_priv-id.port_num) + == PORT_LINK_ETH
[ofa-general] [PATCH 5/8 v3] ib_core: RDMAoE UD packet packing support
Add support functions to aid in packing RDMAoE packets. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/ud_header.c | 111 +++ include/rdma/ib_pack.h | 26 2 files changed, 137 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 8ec7876..d04b6f2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits= 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits= 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -241,6 +264,53 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_rdmaoe_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @header:Structure to initialize + * + * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, grh.payload_length, + * grh.next_header, bth.opcode, bth.pad_count and + * bth.transport_header_version fields of a struct eth_ud_header given + * the payload length and whether a GRH will be included. 
+ */ +void ib_rdmaoe_ud_header_init(int payload_bytes, + int grh_present, + struct eth_ud_header*header) +{ + int header_len; + + memset(header, 0, sizeof *header); + + header_len = + sizeof header-eth + + IB_BTH_BYTES + + IB_DETH_BYTES; + if (grh_present) + header_len += IB_GRH_BYTES; + + header-grh_present = grh_present; + if (grh_present) { + header-grh.ip_version = 6; + header-grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + +IB_DETH_BYTES+ +payload_bytes+ +4+ /* ICRC */ +3) ~3); /* round up */ + header-grh.next_header = 0x1b; + } + + if (header-immediate_present) + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header-bth.pad_count= (4 - payload_bytes) 3; + header-bth.transport_header_version = 0; +} +EXPORT_SYMBOL(ib_rdmaoe_ud_header_init); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header, EXPORT_SYMBOL(ib_ud_header_pack); /** + * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int rdmaoe_ud_header_pack(struct eth_ud_header *header, + void *buf) +{ + int len = 0; + + ib_pack(eth_table, ARRAY_SIZE(eth_table), + header-eth, buf); + len += IB_ETH_BYTES; + + if (header-grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + header-grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + header-bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + header-deth, buf + len); + len += IB_DETH_BYTES; + + if (header-immediate_present) { + memcpy(buf + len, header-immediate_data, + sizeof header-immediate_data); + len += sizeof header-immediate_data; + } + + return len; +} +EXPORT_SYMBOL(rdmaoe_ud_header_pack); + +/** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/include/rdma/ib_pack.h b/include
[ofa-general] [PATCH 6/8 v3] IB/ipoib: restrict IPoIB to work on IB ports only
We don't want IPoIB to work over RDMAoE since it will give worse performance than working directly on Ethernet interfaces which are a prerequisite to RDMAoE anyway. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/ulp/ipoib/ipoib_main.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ab2c192..f12884c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1228,6 +1228,9 @@ static struct net_device *ipoib_add_port(const char *format, struct ib_port_attr attr; int result = -ENOMEM; + if (ib_get_port_link_type(hca, port) != PORT_LINK_IB) + return ERR_PTR(-ENODEV); + priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; -- 1.6.3.3 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 7/8 v3] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or using service from core/addr.c to do that. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/ah.c | 228 ++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 - drivers/infiniband/hw/mlx4/qp.c | 253 +++--- include/linux/mlx4/device.h | 31 - include/linux/mlx4/qp.h |8 +- 5 files changed, 463 insertions(+), 87 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..c994e1f 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,200 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +static struct rdma_addr_client addr_client; + +struct resolve_ctx { + struct completion done; + int status; +}; + + +static int status2err(int status) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + return status; /* TBD */ +} - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); +static void resolve_callback(int status, struct sockaddr *src_addr, +struct rdma_dev_addr *addr, void *context) +{ + struct resolve_ctx *ctx = context; - memset(ah-av, 0, sizeof ah-av); + ctx-status = status; + complete(ctx-done); +} - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - 
--ah-av.stat_rate; +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) +{ + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 dst = {0}; + struct rdma_dev_addr addr; + struct resolve_ctx ctx; + struct net_device *netdev; + int err = 0; + int ifidx; + + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); + + init_completion(ctx.done); + memcpy(dst.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + dst.sin6_family = AF_INET6; + dst.sin6_scope_id = ifidx; + if (rdma_link_local_addr(dst.sin6_addr)) + rdma_get_ll_mac(dst.sin6_addr, mac); + else if (rdma_is_multicast_addr(dst.sin6_addr)) { + rdma_get_mcast_mac(dst.sin6_addr, mac); + *is_mcast = 1; + } else { + err = rdma_resolve_ip(addr_client, NULL, (struct sockaddr *)dst, addr, + 2000, resolve_callback, ctx); + if (!err) + wait_for_completion(ctx.done); + else + ctx.status = err; + + err = status2err(ctx.status); } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + + return err; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); } + ah-av.ib.dlid
[ofa-general] [PATCH 8/8 v3] mlx4: Add RDMAoE support - allow interfaces to correspond to each other
This patch add support RDMAoE for mlx4. Since mlx4_ib now needs to reference mlx4_en netdevices, a new mechanism was added. Two new fields were added to struct mlx4_interface to define a protocol and a get_prot_dev method to retrieve the corresponding protocol's net device. An implementation of the new verb ib_get_port_link_type() - mlx4_ib_get_port_link_type - was added. mlx4_ib_query_port() has been modified to support eth link types. An interface is considered to be active if its corresponding eth interface is active. Code for setting the GID table of a port has been added. Currently, each IB port has a single GID entry in its table and that GID entery equals the link local IPv6 address. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/main.c | 276 + drivers/net/mlx4/cmd.c|6 + drivers/net/mlx4/en_main.c| 15 ++- drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|3 +- drivers/net/mlx4/intf.c | 20 +++ drivers/net/mlx4/main.c |6 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/cmd.h |1 + include/linux/mlx4/driver.h | 16 ++- 10 files changed, 310 insertions(+), 38 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d759..40442b8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -34,6 +34,7 @@ #include linux/module.h #include linux/init.h #include linux/errno.h +#include linux/netdevice.h #include rdma/ib_smi.h #include rdma/ib_user_verbs.h @@ -152,28 +153,19 @@ out: return err; } -static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +static enum ib_port_link_type mlx4_ib_get_port_link_type(struct ib_device *device, +u8 port_num) { - struct ib_smp *in_mad = NULL; - struct ib_smp *out_mad = NULL; - int err = -ENOMEM; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) - goto out; - - memset(props, 0, sizeof *props); - - 
init_query_mad(in_mad); - in_mad-attr_id = IB_SMP_ATTR_PORT_INFO; - in_mad-attr_mod = cpu_to_be32(port); + struct mlx4_dev *dev = to_mdev(device)-dev; - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); - if (err) - goto out; + return dev-caps.port_mask (1 (port_num - 1)) ? + PORT_LINK_IB : PORT_LINK_ETH; +} +static void ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ props-lid = be16_to_cpup((__be16 *) (out_mad-data + 16)); props-lmc = out_mad-data[34] 0x7; props-sm_lid = be16_to_cpup((__be16 *) (out_mad-data + 18)); @@ -193,6 +185,67 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, props-subnet_timeout = out_mad-data[51] 0x1f; props-max_vl_num = out_mad-data[37] 4; props-init_type_reply = out_mad-data[41] 4; + props-link_type= PORT_LINK_IB; +} + +static void eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) +{ + struct mlx4_ib_rdmaoe *rdmaoe = to_mdev(ibdev)-rdmaoe; + struct net_device *ndev; + + props-port_cap_flags = be32_to_cpup((__be32 *) (out_mad-data + 20)); + props-gid_tbl_len = to_mdev(ibdev)-dev-caps.gid_table_len[port]; + props-max_msg_sz = to_mdev(ibdev)-dev-caps.max_msg_sz; + props-pkey_tbl_len = to_mdev(ibdev)-dev-caps.pkey_table_len[port]; + props-bad_pkey_cntr= be16_to_cpup((__be16 *) (out_mad-data + 46)); + props-qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad-data + 48)); + props-active_width = out_mad-data[31] 0xf; + props-active_speed = out_mad-data[35] 4; + props-max_mtu = out_mad-data[41] 0xf; + props-active_mtu = rdmaoe-mtu[port - 1]; + props-subnet_timeout = out_mad-data[51] 0x1f; + props-max_vl_num = out_mad-data[37] 4; + props-init_type_reply = out_mad-data[41] 4; + props-link_type= PORT_LINK_ETH; + spin_lock(rdmaoe-lock); + ndev = rdmaoe-netdevs[port - 1]; + if (!ndev) + goto out; + + props-state= netif_running(ndev) netif_oper_up(ndev) ? 
+ IB_PORT_ACTIVE : IB_PORT_DOWN; + props-phys_state = props-state; +out: + spin_unlock(rdmaoe-lock); +} + +static int
RE: [ofa-general] Write combining support in OFED 1.5
In PPC arch HPAGE_SHIFT is an unexported, variable (because huge page sizes are different there). The huge pages patch needs to reference this value (through HPAGE_SIZE). -Original Message- From: Jack Morgenstein [mailto:ja...@dev.mellanox.co.il] Sent: Tuesday, July 07, 2009 2:14 PM To: Or Gerlitz; Eli Cohen Cc: rdre...@cisco.com; general@lists.openfabrics.org Subject: Re: [ofa-general] Write combining support in OFED 1.5 Adding Eli Cohen (author of the huge-page support patch). Eli, what is missing on the PPC regarding huge page support? -Jack On Tuesday 07 July 2009 13:54, Or Gerlitz wrote: Jack Morgenstein wrote: Yes, see kernel_patches/fixes/mlx4_0010_add_wc.patch. With OFED 1.5, I am trying to use the kernel-based WC support, and eliminate the patch. To do this, I need 2 things: 1. macro pgprot_writecombine to be defined for PPC/PPC64 under the arch directory (file arch/powerpc/include/asm/pgtable.h) 2. A way of determining (via the standard kernel) if write-combining is available on a given platform (to allow disabling use of ConnectX blueflame in userspace if WC is not available). It seems that none of the issues are IB specific, correct? As for PPC guys I think we have another open issue with them, regarding huge pages support, a patch which isn't mainline. Maybe you can use the opportunity to set http://lists.openfabrics.org/pipermail/general/2009-January/056812.htm l as well... Or. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] XRC userspace git trees
Roland, where can I find your userspace trees with XRC support? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH 2/9] ib_core: kernel API for GID -- MAC translations
On Wed, Jun 17, 2009 at 11:19:04AM -0700, Roland Dreier wrote: diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..5bfa2e6 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -81,7 +81,8 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_GET_MAC }; This breaks the ABI defined for XRC. So I guess these patches are not really intended to be applied? I definitely want to finish up XRC before starting to look at this series in detail anyway. Roland, on top of which trees would like me to rebase the patches on and update the ABI version? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH] libibverbs: Add RDMAoE support
On Mon, Jun 15, 2009 at 10:32:03AM -0600, Jason Gunthorpe wrote: On Mon, Jun 15, 2009 at 04:42:36PM +0300, Eli Cohen wrote: Please use inet_pton(AF_INET6,grh,gid) for this? I was searching for something like this... Thanks, I'll fix this. + if (ctx-dgid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx-dgid; + } We've been working to get rid of stuff like this for the IB router work. Can you call into the SA path resolution stuff instead of doing this? Please note this is only an example which initially did not use SA services so I did not change that too. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 3/9] ib_core: RDMAoE support only QP1
On Mon, Jun 15, 2009 at 02:26:42PM -0400, Hal Rosenstock wrote: Should ib_post_send_mad return some error on QP0 sends on RDMAoE ports ? You can't send anything over QP0 because it is not created and so there are no data structs corresponding to it. What QP1 sends are allowed ? Basically, all QP1 sends are allowed without any changes - QP1 functions as normal. However, RDMAoE will initially support only the CM. In the future, we can support additional QP1 services. Is it only SA sends which are faked ? Yes How are others handled ? These questions are important to what happens to the IB management/diag tools when invoked on an RDMAoE port. We need to be able to handle that scenario cleanly. You should be able to send MADs over QP1 from the kernel. I did not make any tests as for sending MADs from userspace but I can't think of a reason why this would be a problem. Is something similar needed in ib_mad_port_close for handling no QP0 on RDMAoE ports in terms of destroy_mad_qp/cleanup_recv_queue calls ? No, because it is handled inside destroy_mad_qp(): if (!qp_info-qp) return; ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ewg] RE: [ofa-general] [PATCH 4/9] ib_core: Add RDMAoE SA support
On Tue, Jun 16, 2009 at 10:27:33AM -0700, Sean Hefty wrote: + +w-member-multicast.rec.qkey = cpu_to_be32(0xc2c); How can a user control this? An app needs the same qkey for unicast traffic. In RDMAoE, the qkey has a fixed well-known value, which will be returned both by multicast and path queries. +atomic_inc(w-member-refcount); This needs to be moved below... I don't follow you - please explain. +static struct ib_sa_multicast * +eth_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ +struct mcast_device *dev; +struct eth_work *w; +struct mcast_member *member; +int err; + +dev = ib_get_client_data(device, mcast_client); +if (!dev) +return ERR_PTR(-ENODEV); + +member = kzalloc(sizeof *member, gfp_mask); +if (!member) +return ERR_PTR(-ENOMEM); + +w = kzalloc(sizeof *w, gfp_mask); +if (!w) { +err = -ENOMEM; +goto out1; +} +w-member = member; +w-device = device; +w-port_num = port_num; + +member-multicast.context = context; +member-multicast.callback = callback; +member-client = client; +member-multicast.rec.mgid = rec-mgid; +init_completion(member-comp); +atomic_set(member-refcount, 1); + +ib_sa_client_get(client); +INIT_WORK(w-work, eth_mcast_work_handler); +queue_work(mcast_wq, w-work); + +return member-multicast; The user could leave/destroy the multicast join request before the queued work item runs. We need to hold an additional reference on the member until the work item completes. Yes, thanks for catching this. I'll fix and resend. There's substantial differences in functionality between an IB multicast group and the Ethernet group. I would rather see the differences hidden by the rdma_cm, than the IB SA module. This is a question of transparency. The motivation is to enable non-cma apps that use the ib_sa_join_multicast() API to work without changes. 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1865049..7bf9b5c 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -45,6 +45,7 @@ There's not a userspace interface into the sa_query module. How will those apps work, or apps that send MADs on QPs other than QP1? Currently, these apps won't work. Sending MADs directly on QPs other than QP1 will not work. However, we expect that a userspace interface to the sa_query module will be implemented in the future, which will forward queries to the kernel module. Naturally, most kernel ULPs and user-level apps will use these standard interfaces instead of implementing MAD queries themselves. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH 0/9] RDMAoE - RDMA over Ethernet
On Tue, Jun 16, 2009 at 11:16:28AM -0500, Steve Wise wrote: Hey Eli, Does this series implement UD/multicast support? I didn't see it with a quick perusal of the patches. Hi Steve, yes UD multicast is supported. Note that in this version of the patches I use the broadcast MAC address (all FF's) for all multicast but this will be changed and GIDs will be mapped to multicast addresses. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH 3/9] ib_core: RDMAoE support only QP1
On Wed, Jun 17, 2009 at 07:14:56AM -0400, Hal Rosenstock wrote: On Wed, Jun 17, 2009 at 7:10 AM, Eli Cohene...@dev.mellanox.co.il wrote: On Mon, Jun 15, 2009 at 02:26:42PM -0400, Hal Rosenstock wrote: Should ib_post_send_mad return some error on QP0 sends on RDMAoE ports ? You can't send anything over QP0 because it is not created and so there are no data structs corresponding to it. Yes, I understand that's the intention but I didn't see where a MAD posted to QP0 returns an error. Does that occur ? Or is it just silently dropped ? But you don't have a struct ib_qp * for QP0 that you could use to post MADs to QP0... What QP1 sends are allowed ? Basically, all QP1 sends are allowed without any changes - QP1 functions as normal. However, RDMAoE will initially support only the CM. In the future, we can support additional QP1 services. So what happens with other QP1 sends now ? Do they go into hyperspace and then timeout ? SA joins and SA path queries are terminated at the driver. Otherwise, post sends on QP1 should be sent on the wire. There are a set of tools (infiniband-diags and ibutils) which do send MADs from userspace. I'm concerned that if someone tries these, the right thing will happen. What exactly do you mean? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 0/9] RDMAoE - RDMA over Ethernet
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol over Ethernet, providing IB capabilities for Ethernet fabrics. The packets are standard Ethernet frames with an Ethertype, an IB GRH, unmodified IB transport headers and payload. HCA RDMAoE ports are no different than regular IB ports from the RDMA stack perspective. IB subnet management and SA services are not required for RDMAoE operation; Ethernet management practices are used instead. In Ethernet, nodes are commonly referred to by applications by means of an IP address. RDMAoE treats IP addresses that were assigned to the corresponding Ethernet port as GIDs, and makes use of the IP stack to bind a destination address to the corresponding netdevice (just as the CMA does today for IB and iWARP) and to obtain its L2 MAC addresses. The RDMA Verbs API is syntactically unmodified. When referring to RDMAoE ports, Address handles are required to contain GIDs and the L2 address fields in the API are ignored. The Ethernet L2 information is then obtained by the vendor-specific driver (both in kernel- and user-space) while modifying QPs to RTR and creating address handles. In order to maintain application compatibility, RDMAoE implements a SA_Query API that locally returns path records with the corresponding GIDs and the other relevant parameters . Consequently, any CMA or native Verbs application, in kernel or user-space, that uses path queries to obtain its address information, will run transparently over RDMAoE with no changes. We have successfully tested MPI, SDP, RDS, and native Verbs applications over RDMAoE without *any* changes. In the mlx4 driver stack, mlx4_en must be loaded and the corresponding eth Ethernet (with support for RDMAoE) or IB, as it was already the case. Following is a series of 9 patches based on version 2.6.30 of the Linux kernel. 
Signed-off-by: Eli Cohen e...@mellanox.co.il drivers/infiniband/core/addr.c| 20 ++- drivers/infiniband/core/agent.c | 16 +- drivers/infiniband/core/cma.c | 39 - drivers/infiniband/core/mad.c | 48 -- drivers/infiniband/core/multicast.c | 153 ++-- drivers/infiniband/core/sa_query.c| 167 ++ drivers/infiniband/core/ud_header.c | 111 drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 34 drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 18 ++ drivers/infiniband/hw/mlx4/ah.c | 228 drivers/infiniband/hw/mlx4/main.c | 276 ++--- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 +++- drivers/infiniband/hw/mlx4/qp.c | 253 +- drivers/infiniband/ulp/ipoib/ipoib_main.c |3 + drivers/net/mlx4/cmd.c|6 + drivers/net/mlx4/en_main.c| 15 ++- drivers/net/mlx4/en_port.c|4 +- drivers/net/mlx4/en_port.h|3 +- drivers/net/mlx4/intf.c | 20 ++ drivers/net/mlx4/main.c |6 + drivers/net/mlx4/mlx4.h |1 + include/linux/mlx4/cmd.h |1 + include/linux/mlx4/device.h | 31 +++- include/linux/mlx4/driver.h | 16 ++- include/linux/mlx4/qp.h |8 +- include/rdma/ib_addr.h| 51 ++ include/rdma/ib_pack.h| 26 +++ include/rdma/ib_user_verbs.h | 21 ++- include/rdma/ib_verbs.h | 22 +++ 31 files changed, 1419 insertions(+), 210 deletions(-) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 1/9] ib_core: Add API to query port link type
This allows to get the type of a port to be either Ethernet or IB which is required by following patches for implementing RDMA over Ethernet - RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/verbs.c |7 +++ include/rdma/ib_verbs.h | 11 +++ 2 files changed, 18 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a7da9be..18631b1 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -94,6 +94,13 @@ rdma_node_get_transport(enum rdma_node_type node_type) } EXPORT_SYMBOL(rdma_node_get_transport); +enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 port_num) +{ + return device-get_port_link_type ? + device-get_port_link_type(device, port_num) : PORT_LINK_IB; +} +EXPORT_SYMBOL(ib_get_port_link_type); + /* Protection domains */ struct ib_pd *ib_alloc_pd(struct ib_device *device) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c179318..8f98c8f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -72,6 +72,11 @@ enum rdma_transport_type { RDMA_TRANSPORT_IWARP }; +enum ib_port_link_type { + PORT_LINK_IB, + PORT_LINK_ETH +}; + enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; @@ -298,6 +303,7 @@ struct ib_port_attr { u8 active_width; u8 active_speed; u8 phys_state; + enum ib_port_link_type link_type; }; enum ib_device_modify_flags { @@ -1003,6 +1009,8 @@ struct ib_device { int(*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); + enum ib_port_link_type (*get_port_link_type)(struct ib_device *device, +u8 port_num); int(*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); @@ -1213,6 +1221,9 @@ int ib_query_device(struct ib_device *device, int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); +enum ib_port_link_type ib_get_port_link_type(struct 
ib_device *device, +u8 port_num); + int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); -- 1.6.3.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 2/9] ib_core: kernel API for GID <-> MAC translations
A few support functions are added to allow the translation from GID to MAC which is required by hw drivers supporting RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/uverbs.h |1 + drivers/infiniband/core/uverbs_cmd.c | 34 + drivers/infiniband/core/uverbs_main.c |1 + drivers/infiniband/core/verbs.c | 11 ++ include/rdma/ib_user_verbs.h | 21 +-- include/rdma/ib_verbs.h | 11 ++ 6 files changed, 76 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index b3ea958..e69b04c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(get_mac); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56feab6..0f1821d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width= attr.active_width; resp.active_speed= attr.active_speed; resp.phys_state = attr.phys_state; + resp.link_type = attr.link_type; if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, sizeof resp)) @@ -1824,6 +1825,39 @@ err: return ret; } +ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_get_maccmd; + struct ib_uverbs_get_mac_resp resp; + int ret; + struct ib_pd*pd; + + if (out_len sizeof resp) + return -ENOSPC; + + if (copy_from_user(cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file-ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac); + put_pd_read(pd); + if (!ret) { + if (copy_to_user((void __user *) (unsigned long) cmd.response, +resp, sizeof 
resp)) { + return -EFAULT; + } + return in_len; + } + + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81..b2f148f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac }; static struct vfsmount *uverbs_event_mnt; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 18631b1..e3600c7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -911,3 +911,14 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp-device-detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + + +int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac) +{ + if (!device-get_mac) + return -ENOSYS; + + return device-get_mac(device, port, gid, mac); +} +EXPORT_SYMBOL(ib_get_mac); + diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index a17f771..5bfa2e6 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -81,7 +81,8 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_GET_MAC }; /* @@ -204,8 +205,9 @@ struct ib_uverbs_query_port_resp { __u8 init_type_reply; __u8 active_width; __u8 active_speed; - __u8 phys_state; - __u8 reserved[3]; + __u8 phys_state; + __u8 link_type; + __u8 reserved[2]; }; struct ib_uverbs_alloc_pd { @@ -621,6 +623,19 @@ struct ib_uverbs_destroy_ah { __u32 ah_handle; }; 
+struct ib_uverbs_get_mac { + __u64 response; + __u32 pd_handle; + __u8 port; + __u8 reserved[3]; + __u8 gid[16]; +}; + +struct ib_uverbs_get_mac_resp { + __u8mac[6
[ofa-general] [PATCH 3/9] ib_core: RDMAoE support only QP1
Since RDMAoE is using Ethernet there is no need for QP0. This patch will create only QP1 for RDMAoE ports. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/agent.c | 16 - drivers/infiniband/core/mad.c | 48 ++- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index ae7c288..658a278 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -46,8 +46,10 @@ #define SPFX ib_agent: struct ib_agent_port_private { - struct list_head port_list; - struct ib_mad_agent *agent[2]; + struct list_headport_list; + struct ib_mad_agent*agent[2]; + struct ib_device *device; + u8 port_num; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, ib_agent_port_list, port_list) { - if (entry-agent[0]-device == device - entry-agent[0]-port_num == port_num) + list_for_each_entry(entry, ib_agent_port_list, port_list) + if (entry-device == device entry-port_num == port_num) return entry; - } + return NULL; } @@ -175,6 +176,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv-device = device; + port_priv-port_num = port_num; + spin_lock_irqsave(ib_agent_port_list_lock, flags); list_add_tail(port_priv-port_list, ib_agent_port_list); spin_unlock_irqrestore(ib_agent_port_list_lock, flags); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index de922a0..aadf396 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -199,6 +199,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, unsigned long flags; u8 mgmt_class, vclass; + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + + if (!port_priv-qp_info[qp_type].qp) + 
return NULL; + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) @@ -260,13 +270,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Validate device and port */ - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -556,6 +559,9 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; + if (!mad_agent) + return 0; + /* If the TID is zero, the agent can only snoop. */ if (mad_agent-hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -2602,6 +2608,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) struct ib_mad_private *recv; struct ib_mad_list_head *mad_list; + if (!qp_info-qp) + return; + while (!list_empty(qp_info-recv_queue.list)) { mad_list = list_entry(qp_info-recv_queue.list.next, @@ -2643,6 +2652,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) for (i = 0; i IB_MAD_QPS_CORE; i++) { qp = port_priv-qp_info[i].qp; + if (!qp) + continue; + /* * PKey index for QP1 is irrelevant but * one is needed for the Reset to Init transition @@ -2684,6 +2696,9 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) } for (i = 0; i IB_MAD_QPS_CORE; i++) { + if (!port_priv-qp_info[i].qp) + continue; + ret = ib_mad_post_receive_mads(port_priv-qp_info[i], NULL); if (ret) { printk(KERN_ERR PFX Couldn't post receive WRs\n); @@ -2762,6 +2777,9 @@ error: static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) { + if (!qp_info-qp) + return; + ib_destroy_qp(qp_info-qp); kfree(qp_info-snoop_table); } @@ -2777,6 +2795,7 @@ static int ib_mad_port_open(struct ib_device *device, struct ib_mad_port_private *port_priv; unsigned long flags; char name[sizeof ib_mad123]; + int has_smi; /* Create new 
device info
[ofa-general] [PATCH 4/9] ib_core: Add RDMAoE SA support
Add support for resolving paths and joining multicast group for RDMAoE ports. The Ethernet specific code will complete immediately but will call the callback from a workqueue context to avoid deadloks. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/multicast.c | 153 drivers/infiniband/core/sa_query.c | 167 +++--- 2 files changed, 269 insertions(+), 51 deletions(-) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 107f170..2417f6b 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -488,6 +488,36 @@ retest: } } +struct eth_work { + struct work_struct work; + struct mcast_member *member; + struct ib_device*device; + u8 port_num; +}; + +static void eth_mcast_work_handler(struct work_struct *work) +{ + struct eth_work *w = container_of(work, struct eth_work, work); + int err; + struct ib_port_attr port_attr; + int status = 0; + + err = ib_query_port(w-device, w-port_num, port_attr); + if (err) + status = err; + else if (port_attr.state != IB_PORT_ACTIVE) + status = -EAGAIN; + + w-member-multicast.rec.qkey = cpu_to_be32(0xc2c); + atomic_inc(w-member-refcount); + err = w-member-multicast.callback(status, w-member-multicast); + deref_member(w-member); + if (err) + ib_sa_free_multicast(w-member-multicast); + + kfree(w); +} + /* * Fail a join request if it is still active - at the head of the pending queue. */ @@ -586,21 +616,14 @@ found: return group; } -/* - * We serialize all join requests to a single group to make our lives much - * easier. Otherwise, two users could try to join the same group - * simultaneously, with different configurations, one could leave while the - * join is in progress, etc., which makes locking around error recovery - * difficult. 
- */ -struct ib_sa_multicast * -ib_sa_join_multicast(struct ib_sa_client *client, -struct ib_device *device, u8 port_num, -struct ib_sa_mcmember_rec *rec, -ib_sa_comp_mask comp_mask, gfp_t gfp_mask, -int (*callback)(int status, -struct ib_sa_multicast *multicast), -void *context) +static struct ib_sa_multicast * +ib_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) { struct mcast_device *dev; struct mcast_member *member; @@ -647,9 +670,81 @@ err: kfree(member); return ERR_PTR(ret); } + +static struct ib_sa_multicast * +eth_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ + struct mcast_device *dev; + struct eth_work *w; + struct mcast_member *member; + int err; + + dev = ib_get_client_data(device, mcast_client); + if (!dev) + return ERR_PTR(-ENODEV); + + member = kzalloc(sizeof *member, gfp_mask); + if (!member) + return ERR_PTR(-ENOMEM); + + w = kzalloc(sizeof *w, gfp_mask); + if (!w) { + err = -ENOMEM; + goto out1; + } + w-member = member; + w-device = device; + w-port_num = port_num; + + member-multicast.context = context; + member-multicast.callback = callback; + member-client = client; + member-multicast.rec.mgid = rec-mgid; + init_completion(member-comp); + atomic_set(member-refcount, 1); + + ib_sa_client_get(client); + INIT_WORK(w-work, eth_mcast_work_handler); + queue_work(mcast_wq, w-work); + + return member-multicast; + +out1: + kfree(member); + return ERR_PTR(err); +} + +/* + * We serialize all join requests to a single group to make our lives much + * easier. 
Otherwise, two users could try to join the same group + * simultaneously, with different configurations, one could leave while the + * join is in progress, etc., which makes locking around error recovery + * difficult. + */ +struct ib_sa_multicast * +ib_sa_join_multicast(struct ib_sa_client *client, +struct ib_device *device, u8 port_num, +struct ib_sa_mcmember_rec *rec
[ofa-general] [PATCH 5/9] ib_core: CMA device binding
Add support for RDMAoE device binding and IP -- GID resolution. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/addr.c | 20 --- drivers/infiniband/core/cma.c | 39 ++ include/rdma/ib_addr.h | 51 3 files changed, 96 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index ce511d8..440e613 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -64,7 +64,7 @@ struct addr_req { static void process_req(struct work_struct *work); -static DEFINE_MUTEX(lock); +static DEFINE_SPINLOCK(lock); static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; @@ -163,7 +163,7 @@ static void queue_req(struct addr_req *req) { struct addr_req *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_reverse(temp_req, req_list, list) { if (time_after_eq(req-timeout, temp_req-timeout)) break; @@ -173,7 +173,7 @@ static void queue_req(struct addr_req *req) if (req_list.next == req-list) set_timeout(req-timeout); - mutex_unlock(lock); + spin_unlock(lock); } static void addr_send_arp(struct sockaddr *dst_in) @@ -207,7 +207,9 @@ static void addr_send_arp(struct sockaddr *dst_in) if (!dst) return; - neigh_event_send(dst-neighbour, NULL); + if (dst-neighbour) + neigh_event_send(dst-neighbour, NULL); + dst_release(dst); break; } @@ -322,7 +324,7 @@ static void process_req(struct work_struct *work) INIT_LIST_HEAD(done_list); - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-status == -ENODATA) { src_in = (struct sockaddr *) req-src_addr; @@ -341,7 +343,7 @@ static void process_req(struct work_struct *work) req = list_entry(req_list.next, struct addr_req, list); set_timeout(req-timeout); } - mutex_unlock(lock); + spin_unlock(lock); list_for_each_entry_safe(req, temp_req, done_list, list) { list_del(req-list); @@ -439,7 +441,7 @@ int rdma_resolve_ip(struct 
rdma_addr_client *client, struct addr_req *req; int ret = 0; - req = kzalloc(sizeof *req, GFP_KERNEL); + req = kzalloc(sizeof *req, GFP_ATOMIC); if (!req) return -ENOMEM; @@ -483,7 +485,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; - mutex_lock(lock); + spin_lock(lock); list_for_each_entry_safe(req, temp_req, req_list, list) { if (req-addr == addr) { req-status = -ECANCELED; @@ -493,7 +495,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) break; } } - mutex_unlock(lock); + spin_unlock(lock); } EXPORT_SYMBOL(rdma_addr_cancel); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 851de83..0f15d9d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -327,22 +327,30 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, llgid, *tmp; int ret = -ENODEV; switch (rdma_node_get_transport(dev_addr-dev_type)) { case RDMA_TRANSPORT_IB: ib_addr_get_sgid(dev_addr, gid); + tmp = gid; break; case RDMA_TRANSPORT_IWARP: iw_addr_get_sgid(dev_addr, gid); + rdma_mac_to_ll_addr(dev_addr-src_dev_addr, llgid); break; default: return -ENODEV; } list_for_each_entry(cma_dev, dev_list, list) { - ret = ib_find_cached_gid(cma_dev-device, gid, + if (ib_get_port_link_type(cma_dev-device, id_priv-id.port_num) + == PORT_LINK_ETH + rdma_node_get_transport(dev_addr-dev_type) == + RDMA_TRANSPORT_IWARP) + tmp = llgid; + + ret = ib_find_cached_gid(cma_dev-device, tmp, id_priv-id.port_num, NULL); if (!ret) { cma_attach_to_dev(id_priv, cma_dev); @@ -1032,7 +1040,19 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt-num_paths == 2) rt-path_rec[1] = *ib_event-param.req_rcvd.alternate_path; - ib_addr_set_dgid(rt
[ofa-general] [PATCH 6/9] ib_core: RDMAoE UD packet packing support
Add support functions to aid in packing RDMAoE packets. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/ud_header.c | 111 +++ include/rdma/ib_pack.h | 26 2 files changed, 137 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 8ec7876..d04b6f2 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -80,6 +80,29 @@ static const struct ib_field lrh_table[] = { .size_bits= 16 } }; +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits= 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits= 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits= 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -241,6 +264,53 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_rdmaoe_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @header:Structure to initialize + * + * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, grh.payload_length, + * grh.next_header, bth.opcode, bth.pad_count and + * bth.transport_header_version fields of a struct eth_ud_header given + * the payload length and whether a GRH will be included. 
+ */ +void ib_rdmaoe_ud_header_init(int payload_bytes, + int grh_present, + struct eth_ud_header*header) +{ + int header_len; + + memset(header, 0, sizeof *header); + + header_len = + sizeof header-eth + + IB_BTH_BYTES + + IB_DETH_BYTES; + if (grh_present) + header_len += IB_GRH_BYTES; + + header-grh_present = grh_present; + if (grh_present) { + header-grh.ip_version = 6; + header-grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + +IB_DETH_BYTES+ +payload_bytes+ +4+ /* ICRC */ +3) ~3); /* round up */ + header-grh.next_header = 0x1b; + } + + if (header-immediate_present) + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header-bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header-bth.pad_count= (4 - payload_bytes) 3; + header-bth.transport_header_version = 0; +} +EXPORT_SYMBOL(ib_rdmaoe_ud_header_init); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into @@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header, EXPORT_SYMBOL(ib_ud_header_pack); /** + * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int rdmaoe_ud_header_pack(struct eth_ud_header *header, + void *buf) +{ + int len = 0; + + ib_pack(eth_table, ARRAY_SIZE(eth_table), + header-eth, buf); + len += IB_ETH_BYTES; + + if (header-grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + header-grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + header-bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + header-deth, buf + len); + len += IB_DETH_BYTES; + + if (header-immediate_present) { + memcpy(buf + len, header-immediate_data, + sizeof header-immediate_data); + len += sizeof header-immediate_data; + } + + return len; +} +EXPORT_SYMBOL(rdmaoe_ud_header_pack); + +/** * ib_ud_header_unpack - Unpack UD header struct from wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/include/rdma/ib_pack.h b/include
[ofa-general] [PATCH 8/9] mlx4: Add support for RDMAoE - address resolution
The following path handles address vectors creation for RDMAoE ports. mlx4 needs the MAC address of the remote node to include it in the WQE of a UD QP or in the QP context of connected QPs. Address resolution is done atomically in the case of a link local address or using service from core/addr.c to do that. mlx4 transport packets were changed too to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mlx4/ah.c | 228 ++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 30 - drivers/infiniband/hw/mlx4/qp.c | 253 +++--- include/linux/mlx4/device.h | 31 - include/linux/mlx4/qp.h |8 +- 5 files changed, 463 insertions(+), 87 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index c75ac94..c994e1f 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -31,63 +31,200 @@ */ #include mlx4_ib.h +#include rdma/ib_addr.h +#include linux/inet.h +#include linux/string.h -struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +static struct rdma_addr_client addr_client; + +struct resolve_ctx { + struct completion done; + int status; +}; + + +static int status2err(int status) { - struct mlx4_dev *dev = to_mdev(pd-device)-dev; - struct mlx4_ib_ah *ah; + return status; /* TBD */ +} - ah = kmalloc(sizeof *ah, GFP_ATOMIC); - if (!ah) - return ERR_PTR(-ENOMEM); +static void resolve_callback(int status, struct sockaddr *src_addr, +struct rdma_dev_addr *addr, void *context) +{ + struct resolve_ctx *ctx = context; - memset(ah-av, 0, sizeof ah-av); + ctx-status = status; + complete(ctx-done); +} - ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); - ah-av.g_slid = ah_attr-src_path_bits; - ah-av.dlid= cpu_to_be16(ah_attr-dlid); - if (ah_attr-static_rate) { - ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET; - while (ah-av.stat_rate IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET - !(1 ah-av.stat_rate dev-caps.stat_rate_support)) - 
--ah-av.stat_rate; +int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, + u8 *mac, int *is_mcast) +{ + struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe; + struct sockaddr_in6 dst = {0}; + struct rdma_dev_addr addr; + struct resolve_ctx ctx; + struct net_device *netdev; + int err = 0; + int ifidx; + + *is_mcast = 0; + spin_lock(rdmaoe-lock); + netdev = rdmaoe-netdevs[ah_attr-port_num - 1]; + if (!netdev) { + spin_unlock(rdmaoe-lock); + return -EINVAL; + } + ifidx = netdev-ifindex; + spin_unlock(rdmaoe-lock); + + init_completion(ctx.done); + memcpy(dst.sin6_addr.s6_addr, ah_attr-grh.dgid.raw, sizeof ah_attr-grh); + dst.sin6_family = AF_INET6; + dst.sin6_scope_id = ifidx; + if (rdma_link_local_addr(dst.sin6_addr)) + rdma_get_ll_mac(dst.sin6_addr, mac); + else if (rdma_is_multicast_addr(dst.sin6_addr)) { + rdma_get_mcast_mac(dst.sin6_addr, mac); + *is_mcast = 1; + } else { + err = rdma_resolve_ip(addr_client, NULL, (struct sockaddr *)dst, addr, + 2000, resolve_callback, ctx); + if (!err) + wait_for_completion(ctx.done); + else + ctx.status = err; + + err = status2err(ctx.status); } - ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl 28); + + return err; +} + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd-device)-dev; + + ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 24)); + ah-av.ib.g_slid = ah_attr-src_path_bits; if (ah_attr-ah_flags IB_AH_GRH) { - ah-av.g_slid |= 0x80; - ah-av.gid_index = ah_attr-grh.sgid_index; - ah-av.hop_limit = ah_attr-grh.hop_limit; - ah-av.sl_tclass_flowlabel |= + ah-av.ib.g_slid |= 0x80; + ah-av.ib.gid_index = ah_attr-grh.sgid_index; + ah-av.ib.hop_limit = ah_attr-grh.hop_limit; + ah-av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr-grh.traffic_class 20) | ah_attr-grh.flow_label); - memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16); + memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16); } + ah-av.ib.dlid
[ofa-general] [PATCH] libibverbs: Add RDMAoE support
Extend the ibv_query_port() verb to return enum ibv_port_link_type which reports the link type to be either IB or Ethernet. This can be used by applications to know if they must use GRH as is the case in RDMAoE. Add a new system call to get the MAC address of the remote port that a UD address vector refers to. Update ibv_rc_pingpong and ibv_ud_pingpong to accept a remote GID so that they can be used with an RDMAoE port. Signed-off-by: Eli Cohen e...@mellanox.co.il --- examples/devinfo.c| 13 ++ examples/pingpong.c | 31 + examples/pingpong.h |3 ++ examples/rc_pingpong.c| 50 examples/ud_pingpong.c| 37 +++--- include/infiniband/driver.h |1 + include/infiniband/kern-abi.h | 23 +- include/infiniband/verbs.h|7 + src/cmd.c | 20 src/libibverbs.map|1 + 10 files changed, 170 insertions(+), 16 deletions(-) diff --git a/examples/devinfo.c b/examples/devinfo.c index caa5d5f..fc9dbb9 100644 --- a/examples/devinfo.c +++ b/examples/devinfo.c @@ -175,6 +175,18 @@ static int print_all_port_gids(struct ibv_context *ctx, uint8_t port_num, int tb return rc; } +static const char *link_type_str(enum ibv_port_link_type type) +{ + switch (type) { + case PORT_LINK_IB: + return PORT_LINK_IB; + case PORT_LINK_ETH: + return PORT_LINK_ETH; + default: + return Unknown; + } +} + static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) { struct ibv_context *ctx; @@ -273,6 +285,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid); printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid); printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc); + printf(\t\t\tlink_type:\t\t%s\n, link_type_str(port_attr.link_type)); if (verbose) { printf(\t\t\tmax_msg_sz:\t\t0x%x\n, port_attr.max_msg_sz); diff --git a/examples/pingpong.c b/examples/pingpong.c index b916f59..e53e2fa 100644 --- a/examples/pingpong.c +++ b/examples/pingpong.c @@ -31,6 +31,8 @@ */ #include pingpong.h +#include arpa/inet.h +#include stdlib.h enum ibv_mtu pp_mtu_to_enum(int 
mtu) { @@ -53,3 +55,32 @@ uint16_t pp_get_local_lid(struct ibv_context *context, int port) return attr.lid; } + +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr) +{ + return ibv_query_port(context, port, attr); +} + +void str2gid(char *grh, union ibv_gid *gid) +{ + char tmp; + + tmp = grh[8]; + grh[8] = 0; + gid-dwords[0] = htonl(strtoul(grh, NULL, 16)); + grh[8] = tmp; + + tmp = grh[16]; + grh[16] = 0; + gid-dwords[1] = htonl(strtoul(grh + 8, NULL, 16)); + grh[16] = tmp; + + tmp = grh[24]; + grh[24] = 0; + gid-dwords[2] = htonl(strtoul(grh + 16, NULL, 16)); + grh[24] = tmp; + + gid-dwords[3] = htonl(strtoul(grh + 24, NULL, 16)); +} + diff --git a/examples/pingpong.h b/examples/pingpong.h index 71d7c3f..8c82b32 100644 --- a/examples/pingpong.h +++ b/examples/pingpong.h @@ -37,5 +37,8 @@ enum ibv_mtu pp_mtu_to_enum(int mtu); uint16_t pp_get_local_lid(struct ibv_context *context, int port); +int pp_get_port_info(struct ibv_context *context, int port, +struct ibv_port_attr *attr); +void str2gid(char *grh, union ibv_gid *gid); #endif /* IBV_PINGPONG_H */ diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c index 26fa45c..23cad7a 100644 --- a/examples/rc_pingpong.c +++ b/examples/rc_pingpong.c @@ -67,6 +67,8 @@ struct pingpong_context { int size; int rx_depth; int pending; + struct ibv_port_attr portinfo; + union ibv_giddgid; }; struct pingpong_dest { @@ -94,6 +96,12 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, .port_num = port } }; + + if (ctx-dgid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx-dgid; + } if (ibv_modify_qp(ctx-qp, attr, IBV_QP_STATE | IBV_QP_AV | @@ -289,11 +297,11 @@ out: static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, - int use_event) + int use_event, int is_server
[ofa-general] [PATCH] libmlx4: Add RDMAoE support
Modify mlx4_create_ah() to check the port link type and for Ethernet ports, do a system call to retrieve the remote port's MAC address. Add ConnectX EN 10GigE PCIe gen2 to the list of supported device. Make modifications to address vector data structs and code to accomodate for RDMAoE. Signed-off-by: Eli Cohen e...@mellanox.co.il --- src/mlx4.c |1 + src/mlx4.h |3 +++ src/qp.c|2 ++ src/verbs.c | 14 ++ src/wqe.h |3 ++- 5 files changed, 22 insertions(+), 1 deletions(-) diff --git a/src/mlx4.c b/src/mlx4.c index 34ece39..d2e32fd 100644 --- a/src/mlx4.c +++ b/src/mlx4.c @@ -66,6 +66,7 @@ struct { HCA(MELLANOX, 0x6354), /* MT25408 Hermon QDR */ HCA(MELLANOX, 0x6732), /* MT25408 Hermon DDR PCIe gen2 */ HCA(MELLANOX, 0x673c), /* MT25408 Hermon QDR PCIe gen2 */ + HCA(MELLANOX, 0x6750), /* MT25408 Hermon EN 10GigE PCIe gen2 */ }; static struct ibv_context_ops mlx4_ctx_ops = { diff --git a/src/mlx4.h b/src/mlx4.h index 827a201..20d3fdd 100644 --- a/src/mlx4.h +++ b/src/mlx4.h @@ -236,11 +236,14 @@ struct mlx4_av { uint8_t hop_limit; uint32_tsl_tclass_flowlabel; uint8_t dgid[16]; + uint8_t mac[8]; }; struct mlx4_ah { struct ibv_ah ibv_ah; struct mlx4_av av; + uint16_tvlan; + uint8_t mac[6]; }; static inline unsigned long align(unsigned long val, unsigned long align) diff --git a/src/qp.c b/src/qp.c index d194ae3..cd8fab0 100644 --- a/src/qp.c +++ b/src/qp.c @@ -143,6 +143,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, memcpy(dseg-av, to_mah(wr-wr.ud.ah)-av, sizeof (struct mlx4_av)); dseg-dqpn = htonl(wr-wr.ud.remote_qpn); dseg-qkey = htonl(wr-wr.ud.remote_qkey); + dseg-vlan = htons(to_mah(wr-wr.ud.ah)-vlan); + memcpy(dseg-mac, to_mah(wr-wr.ud.ah)-mac, 6); } static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) diff --git a/src/verbs.c b/src/verbs.c index cc179a0..ff59a93 100644 --- a/src/verbs.c +++ b/src/verbs.c @@ -617,6 +617,7 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp) struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct 
ibv_ah_attr *attr) { struct mlx4_ah *ah; + struct ibv_port_attr port_attr; ah = malloc(sizeof *ah); if (!ah) @@ -642,7 +643,20 @@ struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) memcpy(ah-av.dgid, attr-grh.dgid.raw, 16); } + if (ibv_query_port(pd-context, attr-port_num, port_attr)) + goto err; + + if (port_attr.link_type == PORT_LINK_ETH) { + if (ibv_cmd_get_mac(pd, attr-port_num, ah-av.dgid, ah-mac)) + goto err; + ah-vlan = 0; + } + + return ah-ibv_ah; +err: + free(ah); + return NULL; } int mlx4_destroy_ah(struct ibv_ah *ah) diff --git a/src/wqe.h b/src/wqe.h index 6f7f309..ea6f27f 100644 --- a/src/wqe.h +++ b/src/wqe.h @@ -78,7 +78,8 @@ struct mlx4_wqe_datagram_seg { uint32_tav[8]; uint32_tdqpn; uint32_tqkey; - uint32_treserved[2]; + __be16 vlan; + uint8_t mac[6]; }; struct mlx4_wqe_data_seg { -- 1.6.3.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Ipoib child interfaces are created in dgram mode, even if the config file (and main interface) specify connected mode.
On Mon, Jun 01, 2009 at 09:01:23AM -0500, Mike Heinz wrote: One of our testers recently reported that he was getting different performance numbers from IPOIB depending on whether he was using interface ib0 or a child interface he had created with /sys/class/netib0/create_child. Investigating, I noticed that the two interfaces have different MTU sizes: ib0 Link encap:UNSPEC HWaddr 80-00-00-02-FE-80-00-00-00-00-00-00-00-00-00-00 inet addr:172.21.33.58 Bcast:172.21.255.255 Mask:255.255.0.0 inet6 addr: fe80::211:7500:ff:8fa4/64 Scope:Link UP BROADCAST RUNNING MULTICAST MTU:65520 Metric:1 RX packets:76 errors:0 dropped:0 overruns:0 frame:0 TX packets:23147668 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:256 RX bytes:5284 (5.1 Kb) TX bytes:17365044664 (16560.5 Mb) ib0.9001 Link encap:UNSPEC HWaddr 80-00-00-06-FE-80-00-00-00-00-00-00-00-00-00-00 inet addr:172.21.34.58 Bcast:172.21.255.255 Mask:255.255.0.0 inet6 addr: fe80::211:7500:ff:8fa4/64 Scope:Link UP BROADCAST RUNNING MULTICAST MTU:2044 Metric:1 RX packets:60413619 errors:0 dropped:0 overruns:0 frame:0 TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:256 RX bytes:112572449188 (107357.4 Mb) TX bytes:0 (0.0 b) This leads me to believe that the child interface is in datagram mode even though the parent is in connected mode. Probably it is, but you can always check the mode by reading the corresponding sysfs file: /sys/class/net/ib0.9001/mode Is this expected behavior? Yes, it is the expected behaviour. Is there a way to change this? 
Yes, it is possible -- Michael Heinz Principal Engineer, Qlogic Corporation King of Prussia, Pennsylvania ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [RFC] ipoib: avoid using stale ipoib_neigh* in ipoib_neigh_cleanup()
On Sun, May 31, 2009 at 09:41:54AM +0300, Or Gerlitz wrote: akep...@sgi.com wrote @ http://lists.openfabrics.org/pipermail/general/2009-May/059730.html What would prevent a race between a tx completion (with an error) and the cleanup of a neighbour? Okay, so maybe this code/design of using the stashed ipoib_neighbour at the tx completion code is the root cause of all these troubles?! From a quick look on the code and two patches that touched this area (f56bcd801... Use separate CQ for UD send completions and 57ce41d1... Fix transmit queue stalling forever) - I see that the original tx cq handler - ipoib_ib_handle_tx_wc() doesn't touch the neigbour but today is called only from the drain timer dev-stop flows. Now, ipoib_cm_handle_tx_wc() is called for normal flow both for datagram and connected modes, and this function touches he neighbour. Or, I don't follow on you - ipoib_cm_handle_tx_wc() called ipoib_neigh_free() from the first commit. Also please note the following designation of CQs: recv_cq: used for all receives and for CM send send_cq: used for UD send Thus, since in ipoib_poll() we poll recv_cq, any none receive must be that of CM mode sends. I am not sure why commit f56bcd801... made UD completions to go through ipoib_cm_handle_tx_wc() nor why this function must use the neighbor to access the data-structure it needs to, maybe Eli can comment on that? Or. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [PATCH 2/3] libmlx4 - Optimize memory allocation of QP buffers with 64K pages
On Wed, May 20, 2009 at 08:00:47AM +0200, sebastien dugue wrote: Well not really, because if we stay below MMAP_THRESHOLD, as we do with 4K pages, the only overhead is malloc's chaining structure. The extra space used to align the buffer is released before posix_memalign() returns, but that does increase fragmentation of mallocs chunks. Also, for 4K pages, mmap() systematically results in a syscall whereas posix_memalign() does not necessarily, but as we're not on a fast path I'm not sure what would be best. I don't mind converting all QP buffers allocation to mmap(), but I'd like to hear what people think. If the only reasoning behind using a MMAP_THRESHOLD is to avoid the system call for smaller allocations, then I think we'd better use a uniform allocation scheme -- mmap -- as you proposed and not distinguish between the two cases. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH 1/2] mlx4_core: Use module parameter for number of MTTs per segment
The current MTTs allocator uses kmalloc to allocate a buffer for it's buddy system implementation and thus is limited by the amount of MTT segments that it can control. As a result, the size of memory that can be registered is limited too. This patch uses a module parameter to control the number of MTT entries that each segment represents, thus allowing to register more memory with the same number of segments. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/net/mlx4/main.c | 14 -- drivers/net/mlx4/mr.c |6 +++--- drivers/net/mlx4/profile.c |2 +- include/linux/mlx4/device.h |1 + 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 30bea96..018348c 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -100,6 +100,10 @@ module_param_named(use_prio, use_prio, bool, 0444); MODULE_PARM_DESC(use_prio, Enable steering by VLAN priority on ETH ports (0/1, default 0)); +static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, Log2 number of MTT entries per segment (1-5)); + int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -203,12 +207,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev-caps.max_cqes = dev_cap-max_cq_sz - 1; dev-caps.reserved_cqs = dev_cap-reserved_cqs; dev-caps.reserved_eqs = dev_cap-reserved_eqs; + dev-caps.mtts_per_seg = 1 log_mtts_per_seg; dev-caps.reserved_mtts = DIV_ROUND_UP(dev_cap-reserved_mtts, - MLX4_MTT_ENTRY_PER_SEG); + dev-caps.mtts_per_seg); dev-caps.reserved_mrws = dev_cap-reserved_mrws; dev-caps.reserved_uars = dev_cap-reserved_uars; dev-caps.reserved_pds = dev_cap-reserved_pds; - dev-caps.mtt_entry_sz = MLX4_MTT_ENTRY_PER_SEG * dev_cap-mtt_entry_sz; + dev-caps.mtt_entry_sz = dev-caps.mtts_per_seg * dev_cap-mtt_entry_sz; dev-caps.max_msg_sz = dev_cap-max_msg_sz; 
dev-caps.page_size_cap = ~(u32) (dev_cap-min_page_sz - 1); dev-caps.flags = dev_cap-flags; @@ -1304,6 +1309,11 @@ static int __init mlx4_verify_params(void) return -1; } + if ((log_mtts_per_seg 1) || (log_mtts_per_seg 5)) { + printk(KERN_WARNING mlx4_core: bad log_mtts_per_seg: %d\n, log_mtts_per_seg); + return -1; + } + return 0; } diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 0caf74c..3b8973d 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -209,7 +209,7 @@ int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, } else mtt-page_shift = page_shift; - for (mtt-order = 0, i = MLX4_MTT_ENTRY_PER_SEG; i npages; i = 1) + for (mtt-order = 0, i = dev-caps.mtts_per_seg; i npages; i = 1) ++mtt-order; mtt-first_seg = mlx4_alloc_mtt_range(dev, mtt-order); @@ -350,7 +350,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mpt_entry-pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | MLX4_MPT_PD_FLAG_RAE); mpt_entry-mtt_sz= cpu_to_be32((1 mr-mtt.order) * - MLX4_MTT_ENTRY_PER_SEG); + dev-caps.mtts_per_seg); } else { mpt_entry-flags|= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); } @@ -391,7 +391,7 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64))) return -EINVAL; - if (start_index (MLX4_MTT_ENTRY_PER_SEG - 1)) + if (start_index (dev-caps.mtts_per_seg - 1)) return -EINVAL; mtts = mlx4_table_find(priv-mr_table.mtt_table, mtt-first_seg + diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index cebdf32..bd22df9 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -98,7 +98,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_EQ].size = dev_cap-eqc_entry_sz; profile[MLX4_RES_DMPT].size = dev_cap-dmpt_entry_sz; profile[MLX4_RES_CMPT].size = dev_cap-cmpt_entry_sz; - profile[MLX4_RES_MTT].size= MLX4_MTT_ENTRY_PER_SEG * dev_cap-mtt_entry_sz; + profile[MLX4_RES_MTT].size= dev-caps.mtts_per_seg * 
dev_cap-mtt_entry_sz; profile[MLX4_RES_MCG].size= MLX4_MGM_ENTRY_SIZE; profile[MLX4_RES_QP].num = request-num_qp
[ofa-general] [PATCH 2/2] ib_mthca: Use module parameter for number of MTTs per segment
The current MTTs allocator uses kmalloc to allocate a buffer for it's buddy system implementation and thus is limited by the amount of MTT segments that it can control. As a result, the size of memory that can be registered is limited too. This patch uses a module parameter to control the number of MTT entries that each segment represents, thus allowing to register more memory with the same number of segments. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/hw/mthca/mthca_cmd.c |2 +- drivers/infiniband/hw/mthca/mthca_dev.h |1 + drivers/infiniband/hw/mthca/mthca_main.c| 17 ++--- drivers/infiniband/hw/mthca/mthca_mr.c | 16 drivers/infiniband/hw/mthca/mthca_profile.c |4 ++-- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 6d55f9d..8c2ed99 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1059,7 +1059,7 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); if (mthca_is_memfree(dev)) dev_lim-reserved_mtts = ALIGN((1 (field 4)) * sizeof(u64), - MTHCA_MTT_SEG_SIZE) / MTHCA_MTT_SEG_SIZE; + dev-limits.mtt_seg_size) / dev-limits.mtt_seg_size; else dev_lim-reserved_mtts = 1 (field 4); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 2525901..9ef611f 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -159,6 +159,7 @@ struct mthca_limits { int reserved_eqs; int num_mpts; int num_mtt_segs; + int mtt_seg_size; int fmr_reserved_mtts; int reserved_mtts; int reserved_mrws; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 1d83cf7..13da9f1 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ 
-125,6 +125,10 @@ module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); MODULE_PARM_DESC(fmr_reserved_mtts, number of memory translation table segments reserved for FMR); +static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, Log2 number of MTT entries per segment (1-5)); + static char mthca_version[] __devinitdata = DRV_NAME : Mellanox InfiniBand HCA driver v DRV_VERSION ( DRV_RELDATE )\n; @@ -162,6 +166,7 @@ static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) int err; u8 status; + mdev-limits.mtt_seg_size = (1 log_mtts_per_seg) * 8; err = mthca_QUERY_DEV_LIM(mdev, dev_lim, status); if (err) { mthca_err(mdev, QUERY_DEV_LIM command failed, aborting.\n); @@ -460,11 +465,11 @@ static int mthca_init_icm(struct mthca_dev *mdev, } /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ - mdev-limits.reserved_mtts = ALIGN(mdev-limits.reserved_mtts * MTHCA_MTT_SEG_SIZE, - dma_get_cache_alignment()) / MTHCA_MTT_SEG_SIZE; + mdev-limits.reserved_mtts = ALIGN(mdev-limits.reserved_mtts * mdev-limits.mtt_seg_size, + dma_get_cache_alignment()) / mdev-limits.mtt_seg_size; mdev-mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca-mtt_base, -MTHCA_MTT_SEG_SIZE, + mdev-limits.mtt_seg_size, mdev-limits.num_mtt_segs, mdev-limits.reserved_mtts, 1, 0); @@ -1315,6 +1320,12 @@ static void __init mthca_validate_profile(void) printk(KERN_WARNING PFX Corrected fmr_reserved_mtts to %d.\n, hca_profile.fmr_reserved_mtts); } + + if ((log_mtts_per_seg 1) || (log_mtts_per_seg 5)) { + printk(KERN_WARNING PFX bad log_mtts_per_seg (%d). 
Using default - %d\n, + log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); + log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); + } } static int __init mthca_init(void) diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 882e6b7..d606edf 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c
Re: [ofa-general] OFED1.3.1: soft lockup in completion handler
On Tue, Apr 21, 2009 at 11:08:25AM +0800, Liang Zhen wrote: In our module, we have per-CPU thread and per-CPU waitq, each thread has it's own connection list to poll on, completion handler will dispatch connection to it's scheduler and wake the scheduler. I'm very sure that scheduler doesn't have any heavy slow operation with holding ibs_lock, and I never got this problem on system with = 8 cores. Looks like completion handler is always run on the same core, so completion handler race with all other cores on their own waitq and very likely to get soft lockup... Where is the other place that you acquire conn-ibc_sched-ibs_lock? Is it in the per CPU thread? Maybe you should try to decrease the time when the lock is acquired at the thread. Can you send all references to the code aquiring the lock? I've tried to turn off irqbalancer and set /proc/irq/.../smp_affinity for more cores, but changed nothing and still soft lockup. After I installed ofed1.4.1 and create CQ with ib_create_cq(comp_vector), the problem is gone and get really good performance. The problem now is, seems ofed1.4.1:mlx4 is the only driver can really support multiple completion vectors, but we can't expect all customers to have the same environment... Is there only other possible way to resolve this? Thanks Liang ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Mon, Apr 13, 2009 at 10:19:08PM +0300, Yossi Etigin wrote: I see. That surprises me... could you also print vma-vm_start and vma-vm_end? I would have expected the vma to be streched to fill a full huge page and return the number of regular pages fitting in it. Eli Cohen wrote: On Mon, Apr 06, 2009 at 08:49:43PM +0300, Yossi Etigin wrote: I don't understand - if all area is huge pages, it does not mean that it fills full huge pages - I can have just 4096 bytes in huge page memory and umem-hugetlb will remain 1, right? You may call ib_umem_get() with a fraction of a huge page but I expect the number of pages returned from get_user_pages() will fill up a huge page. Can you check that with the mckey test you were using? The number of pages is 1. I got this in dmesg with the modified mckey (see the last line): umem: addr=508000 size=1024 hugetlb=0 npages=1 umem: addr=50a000 size=4096 hugetlb=0 npages=1 umem: addr=50c000 size=4352 hugetlb=0 npages=2 umem: addr=50f000 size=4096 hugetlb=0 npages=1 umem: addr=2ac0 size=140 hugetlb=1 npages=1 After applying this to umem.c: --- ofa_kernel-1.4.1/drivers/infiniband/core/umem.c 2009-04-13 22:15:19.0 +0300 +++ ofa_kernel-1.4.1.patched/drivers/infiniband/core/umem.c 2009-04-13 22:09:36.0 +0300 @@ -137,6 +137,7 @@ int ret; int off; int i; + int ntotalpages; DEFINE_DMA_ATTRS(attrs); if (dmasync) @@ -196,6 +197,7 @@ cur_base = addr PAGE_MASK; ret = 0; + ntotalpages = 0; while (npages) { ret = get_user_pages(current, current-mm, cur_base, min_t(unsigned long, npages, @@ -226,6 +228,7 @@ !is_vm_hugetlb_page(vma_list[i + off])) umem-hugetlb = 0; sg_set_page(chunk-page_list[i], page_list[i + off], PAGE_SIZE, 0); + ntotalpages++; } chunk-nmap = ib_dma_map_sg_attrs(context-device, @@ -254,8 +257,11 @@ if (ret 0) { __ib_umem_release(context-device, umem, 0); kfree(umem); - } else + } else { current-mm-locked_vm = locked; + printk(KERN_DEBUG umem: addr=%lx size=%ld hugetlb=%d npages=%d\n, +addr, size, umem-hugetlb, ntotalpages); + } 
up_write(current-mm-mmap_sem); if (vma_list) ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Mon, Apr 06, 2009 at 08:49:43PM +0300, Yossi Etigin wrote: I don't understand - if all area is huge pages, it does not mean that it fills full huge pages - I can have just 4096 bytes in huge page memory and umem-hugetlb will remain 1, right? You may call ib_umem_get() with a fraction of a huge page but I expect the number of pages returned from get_user_pages() will fill up a huge page. Can you check that with the mckey test you were using? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Thu, Apr 02, 2009 at 11:31:42PM +0300, Yossi Etigin wrote: Hi Eli, I've placed a printk in the new hugetlb function to print n and j. While running the mckey test (attached in bugzilla), I got j=0, n=1. Why do you say that the number of pages must cover HUGE_PAGES? In ib_umem_get, hugetlb is set to 0 only if any of the pages is not-hugetlb - otherwise it's 1. Am I missing something? Only if ALL of the registered area is huge pages, then umem-hugetlb will remain 1; otherwise it is cleared and we don't execute handle_hugetlb_user_mr(). In case we do execute handle_hugetlb_user_mr(), I expect the number of PAGE_SIZEed pages to fill full huge pages. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Mon, Mar 30, 2009 at 06:49:37PM +0300, Yossi Etigin wrote: Yossi, I wouldn't expect this to matter since handle_hugetlb_user_mr() only gets called when the memory is huge pages which means the number of PAGE_SIZE pages cover full HUGE_PAGES. You mention in Bugzilla an mckey test but I don't know this test. Can you send how to obtain the test and instructions how to build it and run it? + + if (cur_size) { + arr[j++] = cur_addr; + } + http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH v3] mlx4_ib: Optimize hugetlb pages support
On Thu, Apr 02, 2009 at 11:47:26AM +0300, Or Gerlitz wrote: mckey is installed with librdmac-utils, has man page, etc. Its source is under the examples directory of the librdmacm src tree, you can clone Sean's librdmacm git from ofa to get it - OK? Thanks Or. I will check this. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [ewg] Re: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation
On Sun, Mar 29, 2009 at 12:07:09PM +0300, Moni Shoua wrote: Eli Cohen wrote: On Wed, Mar 25, 2009 at 09:20:19PM +0200, Yossi Etigin wrote: Wouldn't this patch break the fast bond+SM failover recovery? Up till now, in such case neigh-dgid was the old gid until path record query was successful, and that caused to retry the path query until it's successful. With that patch, a new path query will not be issued until the next arp refresh because neigh-dgid will be set to a valid value right after the first packet. Looks to me like How about keeping the memcpy in path_rec-completion, and also adding it to ib_mcast_send (where ipoib neigh is created)? I did not like the idea that a unicast ipoib_neigh can be created and destroyed till the path is resolved so I preferred to initialize at creation time. How about the following patch to solve the problem of failure to resolve a path: From 4e6e3dd0186803f7547f5e4c233d7525dfcdaec6 Mon Sep 17 00:00:00 2001 From: Eli Cohen e...@mellanox.co.il Date: Wed, 25 Mar 2009 16:22:28 +0200 Subject: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation If we fail to do that, there can be superfluous calls to ipoib_neigh_free()/ipoib_neigh_alloc() due to the following if statement in ipoib_start_xmit(): if (unlikely((memcmp(neigh-dgid.raw, skb-dst-neighbour-ha + 4, sizeof(union ib_gid))) || In the case of a unicast GID, it can happen until the path is resolved and is not so severe. In the case of a multicast address, neigh-dgid is never even updated, so the corresponding ipoib_neigh will be destroyed and created on every multicast packet sent. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/ulp/ipoib/ipoib_main.c |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0bd2a4f..b680483 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -460,8 +460,6 @@ static void path_rec_completion(int status, } kref_get(path-ah-ref); neigh-ah = path-ah; - memcpy(neigh-dgid.raw, path-pathrec.dgid.raw, - sizeof(union ib_gid)); if (ipoib_cm_enabled(dev, neigh-neighbour)) { if (!ipoib_cm_get(neigh)) @@ -481,7 +479,9 @@ static void path_rec_completion(int status, __skb_queue_tail(skqueue, skb); } path-valid = 1; - } + } else + list_for_each_entry_safe(neigh, tn, path-neigh_list, list) + memset(neigh-dgid, 0, sizeof neigh-dgid); path-query = NULL; complete(path-done); @@ -878,6 +878,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh-neighbour = neighbour; neigh-dev = dev; *to_ipoib_neigh(neighbour) = neigh; + memcpy(neigh-dgid.raw, neighbour-ha + 4, 16); skb_queue_head_init(neigh-queue); ipoib_cm_set(neigh, NULL); This might work but still, I don't understand what this patch fixes. What was the initial problem? The problem was that for multicast sends, the corresponding ipoib_neigh would be created and destroyed for every packet because this if will fail: if (unlikely((memcmp(neigh-dgid.raw, skb-dst-neighbour-ha + 4, sizeof(union ib_gid))) || There is a less severe problem with unicast flow where the ipoib_neigh object might be created and destroyed until the path is resolved. I agree with Yossi's early comments that the patch breaks the fast fail over we've worked on in the past. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation
On Wed, Mar 25, 2009 at 09:20:19PM +0200, Yossi Etigin wrote: Wouldn't this patch break the fast bond+SM failover recovery? Up till now, in such case neigh-dgid was the old gid until path record query was successful, and that caused to retry the path query until it's successful. With that patch, a new path query will not be issued until the next arp refresh because neigh-dgid will be set to a valid value right after the first packet. Looks to me like How about keeping the memcpy in path_rec-completion, and also adding it to ib_mcast_send (where ipoib neigh is created)? I did not like the idea that a unicast ipoib_neigh can be created and destroyed till the path is resolved so I preferred to initialize at creation time. How about the following patch to solve the problem of failure to resolve a path: From 4e6e3dd0186803f7547f5e4c233d7525dfcdaec6 Mon Sep 17 00:00:00 2001 From: Eli Cohen e...@mellanox.co.il Date: Wed, 25 Mar 2009 16:22:28 +0200 Subject: [PATCH] IB/ipoib: set neigh-dgid upon ipoib_neigh creation If we fail to do that, there can be superfluous calls to ipoib_neigh_free()/ipoib_neigh_alloc() due to the following if statement in ipoib_start_xmit(): if (unlikely((memcmp(neigh-dgid.raw, skb-dst-neighbour-ha + 4, sizeof(union ib_gid))) || In the case of a unicast GID, it can happen until the path is resolved and is not so severe. In the case of a multicast address, neigh-dgid is never even updated, so the corresponding ipoib_neigh will be destroyed and created on every multicast packet sent. 
Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/ulp/ipoib/ipoib_main.c |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0bd2a4f..b680483 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -460,8 +460,6 @@ static void path_rec_completion(int status, } kref_get(path-ah-ref); neigh-ah = path-ah; - memcpy(neigh-dgid.raw, path-pathrec.dgid.raw, - sizeof(union ib_gid)); if (ipoib_cm_enabled(dev, neigh-neighbour)) { if (!ipoib_cm_get(neigh)) @@ -481,7 +479,9 @@ static void path_rec_completion(int status, __skb_queue_tail(skqueue, skb); } path-valid = 1; - } + } else + list_for_each_entry_safe(neigh, tn, path-neigh_list, list) + memset(neigh-dgid, 0, sizeof neigh-dgid); path-query = NULL; complete(path-done); @@ -878,6 +878,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, neigh-neighbour = neighbour; neigh-dev = dev; *to_ipoib_neigh(neighbour) = neigh; + memcpy(neigh-dgid.raw, neighbour-ha + 4, 16); skb_queue_head_init(neigh-queue); ipoib_cm_set(neigh, NULL); -- 1.6.2.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] resolving ipv6 addresses - help needed
Hi, I am trying to resolve IPv6 addresses (that is, obtaining the MAC address of the interface bearing the IP addresses). I am using rdma_resolve_ip() defined in core/addr.c but I don't get consistent results. I get something like this: fe80::202:c9ff:fe00:1341 dev eth1 FAILED where I would expect the address to be reachable through eth2. If I run: ping6 -I eth2 fe80::202:c9ff:fe00:1341 then I'd get the address resolved but after some time, when I attempt to resolve again, the operation fails. Is this API tested and reliable? Maybe I'm doing something wrong... Thanks, Eli ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] resolving ipv6 addresses - help needed
On Thu, Mar 05, 2009 at 09:29:48AM -0800, Sean Hefty wrote: I didn't quite follow this last sentence. Are you saying that it only works sometimes? Only the first time? After I run ping6 from the command line, the remote ipv6 address gets added to the cache and resolution starts working. After some time (about 2 sec or so), I try to resolve the same address again but the operation fails. What kernel version are you running? 2.6.29-rc6 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Too many calls to mlx4_CLOSE_PORT()?
Roland, browsing the code, I see that mlx4_CLOSE_PORT() gets called from, seemingly, too many places. I would expect it to get called only from __mlx4_ib_modify_qp() when QP0 gets closed, but mlx4_ib_remove() calls it too even though it is soon to be called by __mlx4_ib_modify_qp() due to destroying the MAD QP. It also gets called from mlx4_remove_one() even though by the time this function gets called, the port is already closed. Is there a reason for that? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] iscsi initiator ipoib+lro crash on upstream kernel
Thanks! On Thu, Feb 19, 2009 at 9:40 PM, Or Gerlitz or.gerl...@gmail.com wrote: On Thu, Feb 19, 2009 at 6:55 PM, Eli Cohen e...@dev.mellanox.co.il wrote: I have encountered a kernel crash when running a iSCSI initiator on IPoIB configured with LRO (if LRO is off it does not happen). This was seen first on Sles10sp2 but then I verified it happens on 2.6.28.2 too. Eli, This is a known issue (http://bugzilla.kernel.org/show_bug.cgi?id=11804) a fix was submitted upstream and would be included in the next kernel. Or. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] iscsi initiator ipoib+lro crash on upstream kernel
Hi, I have encountered a kernel crash when running a iSCSI initiator on IPoIB configured with LRO (if LRO is off it does not happen). This was seen first on Sles10sp2 but then I verified it happens on 2.6.28.2 too. Bellow is a dump of the crash info from 2.6.28.2: sd 2:0:0:1: Attached scsi generic sg3 type 0 BUG: unable to handle kernel NULL pointer dereference at 0004 IP: [803c50a4] skb_seq_read+0xfb/0x1a1 PGD 227115067 PUD 0 Oops: [#1] SMP last sysfs file: /sys/devices/platform/host2/session2/target2:0:0/2:0:0:1/type CPU 2 Modules linked in: ib_uverbs ib_umad mlx4_ib nfs lockd nfs_acl mlx4_core sunrpc ib_mthca ib_ipoib ib_cm ib_sa ib_mad ib_core inet_lro ipv6 button battery a] Pid: 0, comm: swapper Not tainted 2.6.28.2-debug #3 RIP: 0010:[803c50a4] [803c50a4] skb_seq_read+0xfb/0x1a1 RSP: 0018:88022f0e3b00 EFLAGS: 00010246 RAX: 88022dd44f38 RBX: 88022f0e3b30 RCX: RDX: RSI: 88022f0e3b88 RDI: 07d4 RBP: 07d4 R08: 880220476d30 R09: 085c R10: 000b0038 R11: a0126115 R12: 88022f0e3b88 R13: 88022d974d38 R14: 07d4 R15: 07d4 FS: () GS:88022f07bb50() knlGS: CS: 0010 DS: 0018 ES: 0018 CR0: 8005003b CR2: 0004 CR3: 0002271c2000 CR4: 06e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process swapper (pid: 0, threadinfo 88022f0da000, task 88022f0a4050) Stack: 88022d974fa0 88022f0e3b30 07d4 a01261fe 88022d974f80 880220418068 085c 07d4 880220476d30 88022dd44f38 88022dd44e58 Call Trace: IRQ 0 [a01261fe] ? iscsi_tcp_recv+0x64/0x39b [iscsi_tcp] [803f0d0f] ? ip_queue_xmit+0x2aa/0x2fd [803f60fd] ? tcp_read_sock+0x97/0x212 [a012619a] ? iscsi_tcp_recv+0x0/0x39b [iscsi_tcp] [a012615d] ? iscsi_tcp_data_ready+0x48/0x85 [iscsi_tcp] [803ff119] ? tcp_rcv_established+0x4c0/0x567 [804042f8] ? tcp_v4_do_rcv+0x2c/0x1c8 [80405fb9] ? tcp_v4_rcv+0x630/0x683 [803c6552] ? skb_release_head_state+0x60/0x8f [803ecb9f] ? ip_local_deliver_finish+0xda/0x197 [803ecaab] ? ip_rcv_finish+0x32f/0x349 [a024e42d] ? lro_flush+0x159/0x17e [inet_lro] [a024eb2e] ? __lro_proc_skb+0x1ca/0x1ed [inet_lro] [80221e28] ? 
swiotlb_map_single_phys+0x0/0x12 [a024eb69] ? lro_receive_skb+0x18/0x3e [inet_lro] [a0299582] ? ipoib_ib_handle_rx_wc+0x1ed/0x22b [ib_ipoib] [a0299e97] ? ipoib_poll+0x9c/0x173 [ib_ipoib] [803ce1d0] ? net_rx_action+0x9d/0x175 [80239ffb] ? __do_softirq+0x7a/0x13d [8020cf4c] ? call_softirq+0x1c/0x28 [8020df5d] ? do_softirq+0x2c/0x68 [8020e05b] ? do_IRQ+0xc2/0xdf [8020c206] ? ret_from_intr+0x0/0xa EOI 0 [80212464] ? mwait_idle+0x41/0x44 [8020abca] ? cpu_idle+0x40/0x5e Code: ff 88 48 e0 ff ff 48 c7 43 20 00 00 00 00 ff 43 08 8b 46 0c 01 43 0c 48 8b 43 18 8b 4b 08 8b 90 b4 00 00 00 48 03 90 b8 00 00 00 0f b7 42 04 39 c1 RIP [803c50a4] skb_seq_read+0xfb/0x1a1 RSP 88022f0e3b00 CR2: 0004 Kernel panic - not syncing: Fatal exception in interrupt When I looked at this on sles10 I was able to verify that the problem was with (see bellow where this comes from) st-cur_skb-next equals 0x: if (st-cur_skb-next) { st-cur_skb = st-cur_skb-next; === this where I see the problem st-frag_idx = 0; goto next_skb; } else if (st-root_skb == st-cur_skb ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [ewg] [PATCH] ib_core: save process's virtual address in struct ib_umem
On Mon, Jan 26, 2009 at 10:26:25AM -0800, Roland Dreier wrote: It is has to be saved either at the low level driver's mr object, e.g. struct mlx4_ib_mr, or at a common place like struct ib_umem. Do you prefer that it will be saved in struct mlx4_ib_mr? I don't see why it has to be saved anywhere? The only place you use umem-address is in handle_hugetlb_usermr(), and you could just as easily pass in start directly as a parameter (since mlx4_ib_reg_user_mr() has that value in a parameter anyway). Sorry for the delayed response. I see... you're right - no need to stick the address into struct ib_umem. Following this email is a new patch for mlx4_ib only. I excluded support for both powerpc and ia64 since I could not find a way to get HPAGE_SIZE (or HPAGE_SHIFT) for them. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [ewg] [PATCH] ib_core: save process's virtual address in struct ib_umem
On Thu, Jan 29, 2009 at 07:33:22AM -0800, Roland Dreier wrote: I see... you're right - no need to stick the address into struct ib_umem. Following this email is a new patch for mlx4_ib only. I excluded support for both powerpc and ia64 since I could not find a way to get HPAGE_SIZE (or HPAGE_SHIFT) for them. #include asm/page.h ? It does not help. The problem with powerpc is that HPAGE_SHIFT is an unexported variable and for ia64 it's hpage_shift. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] [PATCH] ib_core: save process's virtual address in struct ib_umem
On Mon, Jan 26, 2009 at 09:59:26AM -0800, Roland Dreier wrote: seems like a really strange thing to do: + umem-address = addr; this value addr is coming from the low-level driver, so I'm not clear why we need to stick it into umem? What am I missing? It is has to be saved either at the low level driver's mr object, e.g. struct mlx4_ib_mr, or at a common place like struct ib_umem. Do you prefer that it will be saved in struct mlx4_ib_mr? ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] Re: [ewg] Re: [PATCH v1] mlx4_ib: Optimize hugetlb pages support
On Thu, Jan 22, 2009 at 09:07:41PM -0800, Roland Dreier wrote: seems this might underestimate by 1 if the region doesn't start/end on a huge-page aligned boundary (but we would still want to use big pages to register it). Looks like we must pass the virtual address through struct ib_umem to the low level driver. I think we could avoid the uninitialized_var() stuff and having restart at all by just doing cur_size = 0 at the start of the loop, and then instead of if (restart) just test if cur_size is 0. initializing cur_size and eliminating restart works fine but cur_addr still needs this trick. I am sending two patches, one for ib_core and one for mlx4. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] ib_core: save process's virtual address in struct ib_umem
add address field to struct ib_umem so low level drivers will have this information which may be needed in order to correctly calculate the number of huge pages. Signed-off-by: Eli Cohen e...@mellanox.co.il --- drivers/infiniband/core/umem.c |1 + include/rdma/ib_umem.h |1 + 2 files changed, 2 insertions(+), 0 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 6f7c096..4c076c4 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -102,6 +102,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem-context = context; umem-length= size; umem-offset= addr ~PAGE_MASK; + umem-address = addr; umem-page_size = PAGE_SIZE; /* * We ask for writable memory if any access flags other than diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 9ee0d2e..c385bb6 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -43,6 +43,7 @@ struct ib_umem { struct ib_ucontext *context; size_t length; int offset; + unsigned long address; int page_size; int writable; int hugetlb; -- 1.6.1 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH v1] mlx4_ib: Optimize hugetlb pages support
Since Linux may not merge adjacent pages into a single scatter entry through calls to dma_map_sg(), we check the special case of hugetlb pages which are likely to be mapped to coniguous dma addresses and if they are, take advantage of this. This will result in a significantly lower number of MTT segments used for registering hugetlb memory regions. Signed-off-by: Eli Cohen e...@mellanox.co.il --- In this version I also took care of the case where the kernel is compiled without hugetlb support. drivers/infiniband/hw/mlx4/mr.c | 86 ++- 1 files changed, 75 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8e4d26d..4c7a5bf 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -119,6 +119,68 @@ out: return err; } +static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr, + u64 virt_addr, int access_flags) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct mlx4_ib_dev *dev = to_mdev(pd-device); + struct ib_umem_chunk *chunk; + unsigned dsize; + dma_addr_t daddr; + unsigned uninitialized_var(cur_size); + dma_addr_t uninitialized_var(cur_addr); + int restart; + int n; + struct ib_umem *umem = mr-umem; + u64 *arr; + int err = 0; + int i; + int j = 0; + +n = PAGE_ALIGN(umem-length + umem-offset) HPAGE_SHIFT; + arr = kmalloc(n * sizeof *arr, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + restart = 1; + list_for_each_entry(chunk, umem-chunk_list, list) + for (i = 0; i chunk-nmap; ++i) { + daddr = sg_dma_address(chunk-page_list[i]); + dsize = sg_dma_len(chunk-page_list[i]); + if (restart) { + cur_addr = daddr; + cur_size = dsize; + restart = 0; + } else if (cur_addr + cur_size != daddr) { + err = -EINVAL; + goto out; + } else +cur_size += dsize; + + if (cur_size HPAGE_SIZE) { + err = -EINVAL; + goto out; + } else if (cur_size == HPAGE_SIZE) { + restart = 1; + arr[j++] = cur_addr; + } + } + + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, umem-length, + 
convert_access(access_flags), n, HPAGE_SHIFT, mr-mmr); + if (err) + goto out; + +err = mlx4_write_mtt(dev-dev, mr-mmr.mtt, 0, n, arr); + +out: + kfree(arr); + return err; +#else + return -ENOSYS; +#endif +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -140,17 +202,19 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_free; } - n = ib_umem_page_count(mr-umem); - shift = ilog2(mr-umem-page_size); - - err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, - convert_access(access_flags), n, shift, mr-mmr); - if (err) - goto err_umem; - - err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); - if (err) - goto err_mr; + if (!mr-umem-hugetlb || handle_hugetlb_user_mr(pd, mr, virt_addr, access_flags)) { + n = ib_umem_page_count(mr-umem); + shift = ilog2(mr-umem-page_size); + + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, + convert_access(access_flags), n, shift, mr-mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); + if (err) + goto err_mr; + } err = mlx4_mr_enable(dev-dev, mr-mmr); if (err) -- 1.6.0.5 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] mlx4_ib: Optimize hugetlb pages support
Since Linux does not merge adjacent pages into a single scatter entry through calls to dma_map_sg(), we do this for huge pages which are guaranteed to be comprised of adjacent natural pages of size PAGE_SIZE. This will result in a significantly lower number of MTT segments used for registering hugetlb memory regions. Signed-off-by: Eli Cohen e...@mellanox.co.il --- I tried this patch and improves memory scalability. Without it, increasing the amount of memory used cause caused decrease in throughput. With this patch there was no drop at all - I used a test based on MPI with 2 processes. drivers/infiniband/hw/mlx4/mr.c | 67 ++ 1 files changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8e4d26d..641ea96 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -119,6 +119,38 @@ out: return err; } +int mlx4_ib_umem_write_huge_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem, int nhpages) +{ + struct ib_umem_chunk *chunk; + int j, i, k; + dma_addr_t *arr; + int err; + + arr = kmalloc(nhpages * sizeof *arr, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + i = 0; + k = 0; + list_for_each_entry(chunk, umem-chunk_list, list) + for (j = 0; j chunk-nmap; ++j, ++k) { + if (!(k ((1 (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) { + if (sg_dma_len(chunk-page_list[j]) != PAGE_SIZE) { + err = -EAGAIN; + goto out; + } + arr[i++] = sg_dma_address(chunk-page_list[j]); + } + } + + err = mlx4_write_mtt(dev-dev, mtt, 0, nhpages, arr); + +out: + kfree(arr); + return err; +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -128,6 +160,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int shift; int err; int n; + int nhuge; + int shift_huge; mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) @@ -142,15 +176,34 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, 
u64 start, u64 length, n = ib_umem_page_count(mr-umem); shift = ilog2(mr-umem-page_size); - - err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, + if (mr-umem-hugetlb) { + nhuge = ALIGN(n shift, HPAGE_SIZE) HPAGE_SHIFT; + shift_huge = HPAGE_SHIFT; + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, + convert_access(access_flags), nhuge, shift_huge, mr-mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_huge_mtt(dev, mr-mmr.mtt, mr-umem, nhuge); + if (err) { + if (err != -EAGAIN) + goto err_mr; + else { + mlx4_mr_free(to_mdev(pd-device)-dev, mr-mmr); + goto regular_pages; + } + } + } else { +regular_pages: + err = mlx4_mr_alloc(dev-dev, to_mpd(pd)-pdn, virt_addr, length, convert_access(access_flags), n, shift, mr-mmr); - if (err) - goto err_umem; + if (err) + goto err_umem; - err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); - if (err) - goto err_mr; + err = mlx4_ib_umem_write_mtt(dev, mr-mmr.mtt, mr-umem); + if (err) + goto err_mr; + } err = mlx4_mr_enable(dev-dev, mr-mmr); if (err) -- 1.6.0.5 ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] Re: [PATCH] IPoIB: fix race when manipulating mcast SKBs queue
On Wed, Dec 17, 2008 at 11:12:39AM -0800, Roland Dreier wrote: I don't see why this would be required. When ipoib_mcast_free() runs, the mcast structure has been removed from all lists and I don't see how any other context could simultaneously be adding packets to pkt_queue. What is the race that you think this fixes? I missed the fact that mcasts are moved to remove list so I guess there is another problem out there. ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
[ofa-general] [PATCH] IPoIB: fix race when manipulating mcast SKBs queue
ipoib_mcast_free() dequeues SKBs pending on the pkt_queue but needs to do that with netif_tx_lock_bh() acquired. Signed-off-by: Eli Cohen e...@mellanox.co.il --- I saw the following bug appear in ofed 1.4 on RHAS5.2 but have not yet had the chance to verify that this patch fixes this particular problem but I think this patch is valid anyway: Dec 13 04:57:26 mtilab17 kernel: ib0: dev_queue_xmit failed to requeue packet Dec 13 04:57:36 mtilab17 kernel: ib0: timing out; 63 sends not completed Dec 13 04:57:36 mtilab17 kernel: Attempt to release alive inet socket 8102621d3680 Dec 13 04:57:36 mtilab17 kernel: Attempt to release alive inet socket 81026ba4b0c0 Dec 13 04:57:36 mtilab17 kernel: BUG: warning at include/net/dst.h:153/dst_release() (Tainted: G ) Dec 13 04:57:36 mtilab17 kernel: Dec 13 04:57:36 mtilab17 kernel: Call Trace: Dec 13 04:57:36 mtilab17 kernel: IRQ [800288ef] __kfree_skb+0x47/0x110 Dec 13 04:57:36 mtilab17 kernel: [885d69ce] :ib_ipoib:ipoib_cm_handle_tx_wc+0xc2/0x228 Dec 13 04:57:36 mtilab17 kernel: [885cfae2] :ib_ipoib:ipoib_poll+0xaa/0x19e Dec 13 04:57:36 mtilab17 kernel: [8000c54c] net_rx_action+0xa4/0x1a4 Dec 13 04:57:36 mtilab17 kernel: [8832da0a] :mlx4_core:poll_catas+0x0/0x137 --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index d9d1223..dd320d5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -94,12 +94,12 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast) if (mcast-ah) ipoib_put_ah(mcast-ah); + netif_tx_lock_bh(dev); while (!skb_queue_empty(mcast-pkt_queue)) { ++tx_dropped; dev_kfree_skb_any(skb_dequeue(mcast-pkt_queue)); } - netif_tx_lock_bh(dev); dev-stats.tx_dropped += tx_dropped; netif_tx_unlock_bh(dev); -- 1.6.0.5 ___ general mailing list general@lists.openfabrics.org 
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
Re: [ofa-general] [PATCH] ipoib: null tx/rx_ring skb pointers on free
On Wed, Nov 05, 2008 at 05:23:07PM -0800, [EMAIL PROTECTED] wrote: Hi Arthur, looking a the patch I don't understand why it should fix the problem you're seeing. I suspect we may be hiding the problem. diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 7b14c2c..8f8650b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev, ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, rx_ring[i].mapping); dev_kfree_skb_any(rx_ring[i].skb); + rx_ring[i].skb = NULL; } vfree(rx_ring); This is not needed since the ring is being freed. @@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ if (unlikely(ib_dma_mapping_error(priv-ca, addr))) { ++dev-stats.tx_errors; dev_kfree_skb_any(skb); + tx_req-skb = NULL; return; } Here we will never get completion so why do we need this? @@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ++dev-stats.tx_errors; ib_dma_unmap_single(priv-ca, addr, skb-len, DMA_TO_DEVICE); dev_kfree_skb_any(skb); + tx_req-skb = NULL; Also here, we don't get a completion. } else { dev-trans_start = jiffies; ++tx-tx_head; @@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) dev-stats.tx_bytes += tx_req-skb-len; dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; And here we already got the completion so we shouldn't exptect another free of the SKB. 
netif_tx_lock(dev); @@ -1179,6 +1183,7 @@ timeout: ib_dma_unmap_single(priv-ca, tx_req-mapping, tx_req-skb-len, DMA_TO_DEVICE); dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; and here we're freeing the ring ++p-tx_tail; netif_tx_lock_bh(p-dev); if (unlikely(--priv-tx_outstanding == ipoib_sendq_size 1) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 28eb6f0..f7e3497 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) dev-stats.tx_bytes += tx_req-skb-len; dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; ++priv-tx_tail; if (unlikely(--priv-tx_outstanding == ipoib_sendq_size 1) @@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, if (unlikely(ipoib_dma_map_tx(priv-ca, tx_req))) { ++dev-stats.tx_errors; dev_kfree_skb_any(skb); + tx_req-skb = NULL; return; } @@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, --priv-tx_outstanding; ipoib_dma_unmap_tx(priv-ca, tx_req); dev_kfree_skb_any(skb); + tx_req-skb = NULL; if (netif_queue_stopped(dev)) netif_wake_queue(dev); } else { @@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv-ca, tx_req); dev_kfree_skb_any(tx_req-skb); + tx_req-skb = NULL; ++priv-tx_tail; --priv-tx_outstanding; } ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general ___ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general