The following patch handles address vector creation for RDMAoE ports. mlx4 needs the MAC address of the remote node in order to place it in the WQE of a UD QP, or in the QP context of connected QPs. Address resolution is done atomically for link-local addresses; otherwise the services of core/addr.c are used. The MLX transport packet path was also changed to accommodate RDMAoE.
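Note on the atomic case: a link-local IPv6 address embeds the MAC address in its interface ID (modified EUI-64), so recovering the MAC is pure bit manipulation and never sleeps. The sketch below is illustrative only, under that assumption; the patch itself relies on rdma_get_ll_mac() from ib_addr for this, and ll_gid_to_mac() is a hypothetical name:

/*
 * Illustrative sketch: recover the MAC from the interface ID of a
 * link-local GID, assuming modified EUI-64 form, i.e.
 * gid[11] == 0xff and gid[12] == 0xfe.
 */
static void ll_gid_to_mac(const u8 *gid, u8 *mac)
{
        mac[0] = gid[8] ^ 2;    /* flip the universal/local bit back */
        mac[1] = gid[9];
        mac[2] = gid[10];
        mac[3] = gid[13];
        mac[4] = gid[14];
        mac[5] = gid[15];
}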
Signed-off-by: Eli Cohen <e...@mellanox.co.il>
---
 drivers/infiniband/hw/mlx4/ah.c      |  228 ++++++++++++++++++++++++++-----
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   30 ++++-
 drivers/infiniband/hw/mlx4/qp.c      |  253 +++++++++++++++++++++++++++-------
 include/linux/mlx4/device.h          |   31 ++++-
 include/linux/mlx4/qp.h              |    8 +-
 5 files changed, 463 insertions(+), 87 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index c75ac94..c994e1f 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -31,63 +31,200 @@
  */
 
 #include "mlx4_ib.h"
+#include <rdma/ib_addr.h>
+#include <linux/inet.h>
+#include <linux/string.h>
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+static struct rdma_addr_client addr_client;
+
+struct resolve_ctx {
+        struct completion done;
+        int status;
+};
+
+static int status2err(int status)
 {
-        struct mlx4_dev *dev = to_mdev(pd->device)->dev;
-        struct mlx4_ib_ah *ah;
+        return status; /* TBD */
+}
 
-        ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-        if (!ah)
-                return ERR_PTR(-ENOMEM);
+static void resolve_callback(int status, struct sockaddr *src_addr,
+                             struct rdma_dev_addr *addr, void *context)
+{
+        struct resolve_ctx *ctx = context;
 
-        memset(&ah->av, 0, sizeof ah->av);
+        ctx->status = status;
+        complete(&ctx->done);
+}
 
-        ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
-        ah->av.g_slid  = ah_attr->src_path_bits;
-        ah->av.dlid    = cpu_to_be16(ah_attr->dlid);
-        if (ah_attr->static_rate) {
-                ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
-                while (ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
-                       !(1 << ah->av.stat_rate & dev->caps.stat_rate_support))
-                        --ah->av.stat_rate;
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+                        u8 *mac, int *is_mcast)
+{
+        struct mlx4_ib_rdmaoe *rdmaoe = &dev->rdmaoe;
+        struct sockaddr_in6 dst = {0};
+        struct rdma_dev_addr addr;
+        struct resolve_ctx ctx;
+        struct net_device *netdev;
+        int err = 0;
+        int ifidx;
+
+        *is_mcast = 0;
+        spin_lock(&rdmaoe->lock);
+        netdev = rdmaoe->netdevs[ah_attr->port_num - 1];
+        if (!netdev) {
+                spin_unlock(&rdmaoe->lock);
+                return -EINVAL;
+        }
+        ifidx = netdev->ifindex;
+        spin_unlock(&rdmaoe->lock);
+
+        init_completion(&ctx.done);
+        memcpy(dst.sin6_addr.s6_addr, ah_attr->grh.dgid.raw, sizeof ah_attr->grh.dgid.raw);
+        dst.sin6_family = AF_INET6;
+        dst.sin6_scope_id = ifidx;
+        if (rdma_link_local_addr(&dst.sin6_addr))
+                rdma_get_ll_mac(&dst.sin6_addr, mac);
+        else if (rdma_is_multicast_addr(&dst.sin6_addr)) {
+                rdma_get_mcast_mac(&dst.sin6_addr, mac);
+                *is_mcast = 1;
+        } else {
+                err = rdma_resolve_ip(&addr_client, NULL, (struct sockaddr *)&dst, &addr,
+                                      2000, resolve_callback, &ctx);
+                if (!err)
+                        wait_for_completion(&ctx.done);
+                else
+                        ctx.status = err;
+
+                err = status2err(ctx.status);
         }
-        ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+        return err;
+}
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+                                  struct mlx4_ib_ah *ah)
+{
+        struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+
+        ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+        ah->av.ib.g_slid  = ah_attr->src_path_bits;
         if (ah_attr->ah_flags & IB_AH_GRH) {
-                ah->av.g_slid   |= 0x80;
-                ah->av.gid_index = ah_attr->grh.sgid_index;
-                ah->av.hop_limit = ah_attr->grh.hop_limit;
-                ah->av.sl_tclass_flowlabel |=
+                ah->av.ib.g_slid   |= 0x80;
+                ah->av.ib.gid_index = ah_attr->grh.sgid_index;
+                ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
+                ah->av.ib.sl_tclass_flowlabel |=
                         cpu_to_be32((ah_attr->grh.traffic_class << 20) |
                                     ah_attr->grh.flow_label);
-                memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16);
+                memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
         }
+        ah->av.ib.dlid = cpu_to_be16(ah_attr->dlid);
+        if (ah_attr->static_rate) {
+                ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+                while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+                       !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+                        --ah->av.ib.stat_rate;
+        }
+        ah->av.ib.sl_tclass_flowlabel |= cpu_to_be32(ah_attr->sl << 28);
 
         return &ah->ibah;
 }
 
+static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+                                      struct mlx4_ib_ah *ah)
+{
+        struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+        struct mlx4_dev *dev = ibdev->dev;
+        u8 mac[6];
+        int err;
+        int is_mcast;
+
+        err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast);
+        if (err)
+                return ERR_PTR(err);
+
+        memcpy(ah->av.eth.mac_0_1, mac, 2);
+        memcpy(ah->av.eth.mac_2_5, mac + 2, 4);
+        ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+        ah->av.ib.g_slid = 0x80;
+        if (ah_attr->static_rate) {
+                ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+                while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+                       !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+                        --ah->av.ib.stat_rate;
+        }
+
+        /*
+         * HW requires a multicast LID so we just choose one.
+         */
+        if (is_mcast)
+                ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+        memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
+        ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+        return &ah->ibah;
+}
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+        struct mlx4_ib_ah *ah;
+        enum ib_port_link_type link_type;
+        struct ib_ah *ret;
+
+        ah = kzalloc(sizeof *ah, GFP_ATOMIC);
+        if (!ah)
+                return ERR_PTR(-ENOMEM);
+
+        link_type = ib_get_port_link_type(pd->device, ah_attr->port_num);
+        if (link_type == PORT_LINK_ETH) {
+                if (!(ah_attr->ah_flags & IB_AH_GRH)) {
+                        ret = ERR_PTR(-EINVAL);
+                        goto out;
+                } else {
+                        /* TBD: need to handle the case where we are called
+                           in an atomic context and might sleep.  We don't
+                           expect this currently since we're working with
+                           link-local addresses, which we can translate
+                           without going to sleep. */
+                        ret = create_rdmaoe_ah(pd, ah_attr, ah);
+                        if (IS_ERR(ret))
+                                goto out;
+                        else
+                                return ret;
+                }
+        } else
+                return create_ib_ah(pd, ah_attr, ah); /* never fails */
+
+out:
+        kfree(ah);
+        return ret;
+}
+
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
 {
         struct mlx4_ib_ah *ah = to_mah(ibah);
+        enum ib_port_link_type lt;
 
+        lt = ib_get_port_link_type(ibah->device, be32_to_cpu(ah->av.ib.port_pd) >> 24);
         memset(ah_attr, 0, sizeof *ah_attr);
 
-        ah_attr->dlid          = be16_to_cpu(ah->av.dlid);
-        ah_attr->sl            = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
-        ah_attr->port_num      = be32_to_cpu(ah->av.port_pd) >> 24;
-        if (ah->av.stat_rate)
-                ah_attr->static_rate = ah->av.stat_rate - MLX4_STAT_RATE_OFFSET;
-        ah_attr->src_path_bits = ah->av.g_slid & 0x7F;
+        ah_attr->dlid          = lt == PORT_LINK_IB ? be16_to_cpu(ah->av.ib.dlid) : 0;
+        ah_attr->sl            = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+        ah_attr->port_num      = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+        if (ah->av.ib.stat_rate)
+                ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
+        ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
 
         if (mlx4_ib_ah_grh_present(ah)) {
                 ah_attr->ah_flags = IB_AH_GRH;
 
                 ah_attr->grh.traffic_class =
-                        be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20;
+                        be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20;
                 ah_attr->grh.flow_label =
-                        be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff;
-                ah_attr->grh.hop_limit  = ah->av.hop_limit;
-                ah_attr->grh.sgid_index = ah->av.gid_index;
-                memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16);
+                        be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff;
+                ah_attr->grh.hop_limit  = ah->av.ib.hop_limit;
+                ah_attr->grh.sgid_index = ah->av.ib.gid_index;
+                memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
         }
 
         return 0;
@@ -98,3 +235,30 @@ int mlx4_ib_destroy_ah(struct ib_ah *ah)
         kfree(to_mah(ah));
         return 0;
 }
+
+int mlx4_ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac)
+{
+        int err;
+        struct mlx4_ib_dev *ibdev = to_mdev(device);
+        struct ib_ah_attr ah_attr = {
+                .port_num = port,
+        };
+        int is_mcast;
+
+        memcpy(ah_attr.grh.dgid.raw, gid, 16);
+        err = mlx4_ib_resolve_grh(ibdev, &ah_attr, mac, &is_mcast);
+        if (err)
+                return err;
+
+        return 0;
+}
+
+void mlx4_ib_addr_init(void)
+{
+        rdma_addr_register_client(&addr_client);
+}
+
+void mlx4_ib_addr_cleanup(void)
+{
+        rdma_addr_unregister_client(&addr_client);
+}
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 8a7dd67..e2d6e62 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -138,6 +138,7 @@ struct mlx4_ib_qp {
         u8                      resp_depth;
         u8                      sq_no_prefetch;
         u8                      state;
+        int                     mlx_type;
 };
 
 struct mlx4_ib_srq {
@@ -157,7 +158,15 @@ struct mlx4_ib_srq {
 
 struct mlx4_ib_ah {
         struct ib_ah            ibah;
-        struct mlx4_av          av;
+        union mlx4_ext_av       av;
+};
+
+struct mlx4_ib_rdmaoe {
+        spinlock_t              lock;
+        struct net_device      *netdevs[MLX4_MAX_PORTS];
+        int                     enstate[MLX4_MAX_PORTS];
+        int                     mtu[MLX4_MAX_PORTS];
+        struct notifier_block   nb;
 };
 
 struct mlx4_ib_dev {
@@ -175,6 +184,8 @@ struct mlx4_ib_dev {
 
         spinlock_t              sm_lock;
         struct mutex            cap_mask_mutex;
+
+        struct mlx4_ib_rdmaoe   rdmaoe;
 };
 
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
@@ -312,10 +323,25 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
                          u64 iova);
 int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
 int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+void mlx4_ib_addr_init(void);
+void mlx4_ib_addr_cleanup(void);
+
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+                        u8 *mac, int *is_mcast);
+
+int mlx4_ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac);
 
 static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
 {
-        return !!(ah->av.g_slid & 0x80);
+        /*
+         * The port number is located in the same place for both IB and Eth AVs.
+         */
+        u8 port = (be32_to_cpu(ah->av.ib.port_pd) >> 24) & 3;
+
+        if (ib_get_port_link_type(ah->ibah.device, port) == PORT_LINK_ETH)
+                return 1;
+        else
+                return !!(ah->av.ib.g_slid & 0x80);
 }
 
 #endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 20724ae..7a6b765 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/log2.h>
+#include <linux/netdevice.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
@@ -47,14 +48,21 @@ enum {
 
 enum {
         MLX4_IB_DEFAULT_SCHED_QUEUE     = 0x83,
-        MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f
+        MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+        MLX4_IB_LINK_TYPE_IB            = 0,
+        MLX4_IB_LINK_TYPE_ETH           = 1
 };
 
 enum {
         /*
          * Largest possible UD header: send with GRH and immediate data.
+         * 4 bytes added to accommodate the eth header used instead of the lrh.
          */
-        MLX4_IB_UD_HEADER_SIZE  = 72
+        MLX4_IB_UD_HEADER_SIZE  = 76
+};
+
+enum {
+        MLX4_RDMAOE_ETHERTYPE = 0x8915
 };
 
 struct mlx4_ib_sqp {
@@ -62,7 +70,10 @@ struct mlx4_ib_sqp {
         int                     pkey_index;
         u32                     qkey;
         u32                     send_psn;
-        struct ib_ud_header     ud_header;
+        union {
+                struct ib_ud_header     ib;
+                struct eth_ud_header    eth;
+        } hdr;
         u8                      header_buf[MLX4_IB_UD_HEADER_SIZE];
 };
 
@@ -782,18 +793,6 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
         return 0;
 }
 
-static int to_mlx4_st(enum ib_qp_type type)
-{
-        switch (type) {
-        case IB_QPT_RC:         return MLX4_QP_ST_RC;
-        case IB_QPT_UC:         return MLX4_QP_ST_UC;
-        case IB_QPT_UD:         return MLX4_QP_ST_UD;
-        case IB_QPT_SMI:
-        case IB_QPT_GSI:        return MLX4_QP_ST_MLX;
-        default:                return -1;
-        }
-}
-
 static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
                                    int attr_mask)
 {
@@ -843,6 +842,12 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
 static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
                          struct mlx4_qp_path *path, u8 port)
 {
+        int err;
+        int is_eth = ib_get_port_link_type(&dev->ib_dev, port) ==
+                PORT_LINK_ETH ? 1 : 0;
+        u8 mac[6];
+        int is_mcast;
+
         path->grh_mylmc     = ah->src_path_bits & 0x7f;
         path->rlid          = cpu_to_be16(ah->dlid);
         if (ah->static_rate) {
@@ -873,9 +878,36 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
         path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
                 ((port - 1) << 6) | ((ah->sl & 0xf) << 2);
 
+        if (is_eth) {
+                if (!(ah->ah_flags & IB_AH_GRH))
+                        return -1;
+
+                err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast);
+                if (err)
+                        return err;
+
+                memcpy(path->dmac_h, mac, 2);
+                memcpy(path->dmac_l, mac + 2, 4);
+                path->ackto = MLX4_IB_LINK_TYPE_ETH;
+                /* use index 0 into MAC table for RDMAoE */
+                path->grh_mylmc &= 0x80;
+        }
+
         return 0;
 }
 
+static int to_mlx4_st(enum ib_qp_type type)
+{
+        switch (type) {
+        case IB_QPT_RC:         return MLX4_QP_ST_RC;
+        case IB_QPT_UC:         return MLX4_QP_ST_UC;
+        case IB_QPT_UD:         return MLX4_QP_ST_UD;
+        case IB_QPT_SMI:
+        case IB_QPT_GSI:        return MLX4_QP_ST_MLX;
+        default:                return -1;
+        }
+}
+
 static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                                const struct ib_qp_attr *attr, int attr_mask,
                                enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -972,7 +1004,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
         }
 
         if (attr_mask & IB_QP_TIMEOUT) {
-                context->pri_path.ackto = attr->timeout << 3;
+                context->pri_path.ackto |= (attr->timeout << 3);
                 optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
         }
 
@@ -1206,8 +1238,8 @@ out:
         return err;
 }
 
-static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
-                            void *wqe, unsigned *mlx_seg_len)
+static int build_ib_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+                               void *wqe, unsigned *mlx_seg_len)
 {
         struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
         struct mlx4_wqe_mlx_seg *mlx = wqe;
@@ -1223,61 +1255,171 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
         for (i = 0; i < wr->num_sge; ++i)
                 send_size += wr->sg_list[i].length;
 
-        ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);
+        ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->hdr.ib);
 
-        sqp->ud_header.lrh.service_level   =
-                be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
-        sqp->ud_header.lrh.destination_lid = ah->av.dlid;
-        sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid & 0x7f);
+        sqp->hdr.ib.lrh.service_level =
+                be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+        sqp->hdr.ib.lrh.destination_lid = ah->av.ib.dlid;
+        sqp->hdr.ib.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
         if (mlx4_ib_ah_grh_present(ah)) {
-                sqp->ud_header.grh.traffic_class =
-                        (be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
-                sqp->ud_header.grh.flow_label    =
-                        ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
-                sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
-                ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
-                                  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
-                memcpy(sqp->ud_header.grh.destination_gid.raw,
-                       ah->av.dgid, 16);
+                sqp->hdr.ib.grh.traffic_class =
+                        (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+                sqp->hdr.ib.grh.flow_label =
+                        ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+                sqp->hdr.ib.grh.hop_limit = ah->av.ib.hop_limit;
+                ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+                                  ah->av.ib.gid_index, &sqp->hdr.ib.grh.source_gid);
+                memcpy(sqp->hdr.ib.grh.destination_gid.raw,
+                       ah->av.ib.dgid, 16);
         }
 
         mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
         mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
-                                  (sqp->ud_header.lrh.destination_lid ==
+                                  (sqp->hdr.ib.lrh.destination_lid ==
                                    IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
-                                  (sqp->ud_header.lrh.service_level << 8));
-        mlx->rlid   = sqp->ud_header.lrh.destination_lid;
+                                  (sqp->hdr.ib.lrh.service_level << 8));
+        mlx->rlid = sqp->hdr.ib.lrh.destination_lid;
+
+        switch (wr->opcode) {
+        case IB_WR_SEND:
+                sqp->hdr.ib.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+                sqp->hdr.ib.immediate_present = 0;
+                break;
+        case IB_WR_SEND_WITH_IMM:
+                sqp->hdr.ib.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+                sqp->hdr.ib.immediate_present = 1;
+                sqp->hdr.ib.immediate_data = wr->ex.imm_data;
+                break;
+        default:
+                return -EINVAL;
+        }
+
+        sqp->hdr.ib.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+        if (sqp->hdr.ib.lrh.destination_lid == IB_LID_PERMISSIVE)
+                sqp->hdr.ib.lrh.source_lid = IB_LID_PERMISSIVE;
+        sqp->hdr.ib.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+        if (!sqp->qp.ibqp.qp_num)
+                ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+        else
+                ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+        sqp->hdr.ib.bth.pkey = cpu_to_be16(pkey);
+        sqp->hdr.ib.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+        sqp->hdr.ib.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+        sqp->hdr.ib.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+                                            sqp->qkey : wr->wr.ud.remote_qkey);
+        sqp->hdr.ib.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+        header_size = ib_ud_header_pack(&sqp->hdr.ib, sqp->header_buf);
+
+        /*
+         * Inline data segments may not cross a 64 byte boundary.  If
+         * our UD header is bigger than the space available up to the
+         * next 64 byte boundary in the WQE, use two inline data
+         * segments to hold the UD header.
+         */
+        spc = MLX4_INLINE_ALIGN -
+                ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+        if (header_size <= spc) {
+                inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+                memcpy(inl + 1, sqp->header_buf, header_size);
+                i = 1;
+        } else {
+                inl->byte_count = cpu_to_be32(1 << 31 | spc);
+                memcpy(inl + 1, sqp->header_buf, spc);
+
+                inl = (void *) (inl + 1) + spc;
+                memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+                /*
+                 * Need a barrier here to make sure all the data is
+                 * visible before the byte_count field is set.
+                 * Otherwise the HCA prefetcher could grab the 64-byte
+                 * chunk with this inline segment and get a valid (!=
+                 * 0xffffffff) byte count but stale data, and end up
+                 * generating a packet with bad headers.
+                 *
+                 * The first inline segment's byte_count field doesn't
+                 * need a barrier, because it comes after a
+                 * control/MLX segment and therefore is at an offset
+                 * of 16 mod 64.
+                 */
+                wmb();
+                inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+                i = 2;
+        }
+
+        *mlx_seg_len =
+                ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+        return 0;
+}
+
+static int build_eth_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+                                void *wqe, unsigned *mlx_seg_len)
+{
+        struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+        struct mlx4_wqe_mlx_seg *mlx = wqe;
+        struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+        struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+        u16 pkey;
+        int send_size;
+        int header_size;
+        int spc;
+        int i;
+        void *tmp;
+
+        send_size = 0;
+        for (i = 0; i < wr->num_sge; ++i)
+                send_size += wr->sg_list[i].length;
+
+        ib_rdmaoe_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->hdr.eth);
+
+        if (mlx4_ib_ah_grh_present(ah)) {
+                sqp->hdr.eth.grh.traffic_class =
+                        (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+                sqp->hdr.eth.grh.flow_label =
+                        ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+                sqp->hdr.eth.grh.hop_limit = ah->av.ib.hop_limit;
+                ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+                                  ah->av.ib.gid_index, &sqp->hdr.eth.grh.source_gid);
+                memcpy(sqp->hdr.eth.grh.destination_gid.raw,
+                       ah->av.ib.dgid, 16);
+        }
+
+        mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
         switch (wr->opcode) {
         case IB_WR_SEND:
-                sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
-                sqp->ud_header.immediate_present = 0;
+                sqp->hdr.eth.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+                sqp->hdr.eth.immediate_present = 0;
                 break;
         case IB_WR_SEND_WITH_IMM:
-                sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
-                sqp->ud_header.immediate_present = 1;
-                sqp->ud_header.immediate_data    = wr->ex.imm_data;
+                sqp->hdr.eth.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+                sqp->hdr.eth.immediate_present = 1;
+                sqp->hdr.eth.immediate_data = wr->ex.imm_data;
                 break;
         default:
                 return -EINVAL;
         }
 
-        sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
-        if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
-                sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
-        sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+        memcpy(sqp->hdr.eth.eth.dmac_h, ah->av.eth.mac_0_1, 2);
+        memcpy(sqp->hdr.eth.eth.dmac_h + 2, ah->av.eth.mac_2_5, 2);
+        memcpy(sqp->hdr.eth.eth.dmac_l, ah->av.eth.mac_2_5 + 2, 2);
+        tmp = to_mdev(sqp->qp.ibqp.device)->rdmaoe.netdevs[sqp->qp.port - 1]->dev_addr;
+        memcpy(sqp->hdr.eth.eth.smac_h, tmp, 2);
+        memcpy(sqp->hdr.eth.eth.smac_l, tmp + 2, 4);
+        sqp->hdr.eth.eth.type = cpu_to_be16(MLX4_RDMAOE_ETHERTYPE);
+        sqp->hdr.eth.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
         if (!sqp->qp.ibqp.qp_num)
                 ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
         else
                 ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
-        sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
-        sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
-        sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
-        sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+        sqp->hdr.eth.bth.pkey = cpu_to_be16(pkey);
+        sqp->hdr.eth.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+        sqp->hdr.eth.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+        sqp->hdr.eth.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
                                              sqp->qkey : wr->wr.ud.remote_qkey);
-        sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+        sqp->hdr.eth.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
 
-        header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+        header_size = rdmaoe_ud_header_pack(&sqp->hdr.eth, sqp->header_buf);
 
         if (0) {
                 printk(KERN_ERR "built UD header of size %d:\n", header_size);
@@ -1333,6 +1475,15 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
         return 0;
 }
 
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+                            void *wqe, unsigned *mlx_seg_len)
+{
+        if (ib_get_port_link_type(sqp->qp.ibqp.device, sqp->qp.port) == PORT_LINK_IB)
+                return build_ib_mlx_header(sqp, wr, wqe, mlx_seg_len);
+        else
+                return build_eth_mlx_header(sqp, wr, wqe, mlx_seg_len);
+}
+
 static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
 {
         unsigned cur;
@@ -1414,6 +1565,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
         memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
         dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
         dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+        dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+        memcpy(dseg->mac_0_1, to_mah(wr->wr.ud.ah)->av.eth.mac_0_1, 6);
 }
 
 static void set_mlx_icrc_seg(void *dseg)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 3aff8a6..b73b5f0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -66,7 +66,8 @@ enum {
         MLX4_DEV_CAP_FLAG_ATOMIC        = 1 << 18,
         MLX4_DEV_CAP_FLAG_RAW_MCAST     = 1 << 19,
         MLX4_DEV_CAP_FLAG_UD_AV_PORT    = 1 << 20,
-        MLX4_DEV_CAP_FLAG_UD_MCAST      = 1 << 21
+        MLX4_DEV_CAP_FLAG_UD_MCAST      = 1 << 21,
+        MLX4_DEV_CAP_FLAG_RDMAOE        = 1 << 30
 };
 
 enum {
@@ -371,6 +372,28 @@ struct mlx4_av {
         u8                      dgid[16];
 };
 
+struct mlx4_eth_av {
+        __be32          port_pd;
+        u8              reserved1;
+        u8              smac_idx;
+        u16             reserved2;
+        u8              reserved3;
+        u8              gid_index;
+        u8              stat_rate;
+        u8              hop_limit;
+        __be32          sl_tclass_flowlabel;
+        u8              dgid[16];
+        u32             reserved4[2];
+        __be16          vlan;
+        u8              mac_0_1[2];
+        u8              mac_2_5[4];
+};
+
+union mlx4_ext_av {
+        struct mlx4_av          ib;
+        struct mlx4_eth_av      eth;
+};
+
 struct mlx4_dev {
         struct pci_dev         *pdev;
         unsigned long           flags;
@@ -399,6 +422,12 @@ struct mlx4_init_port_param {
                 if (((type) == MLX4_PORT_TYPE_IB ? (dev)->caps.port_mask : \
                      ~(dev)->caps.port_mask) & 1 << ((port) - 1))
 
+#define mlx4_foreach_ib_transport_port(port, dev)                        \
+        for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)      \
+                if (((dev)->caps.port_mask & 1 << ((port) - 1)) ||       \
+                    ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_RDMAOE))
+
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
                    struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index bf8f119..d73534f 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -112,7 +112,9 @@ struct mlx4_qp_path {
         u8                      snooper_flags;
         u8                      reserved3[2];
         u8                      counter_index;
-        u8                      reserved4[7];
+        u8                      reserved4;
+        u8                      dmac_h[2];
+        u8                      dmac_l[4];
 };
 
 struct mlx4_qp_context {
@@ -218,7 +220,9 @@ struct mlx4_wqe_datagram_seg {
         __be32                  av[8];
         __be32                  dqpn;
         __be32                  qkey;
-        __be32                  reservd[2];
+        __be16                  vlan;
+        u8                      mac_0_1[2];
+        u8                      mac_2_5[4];
 };
 
 struct mlx4_wqe_lso_seg {
-- 
1.6.3.3
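Usage note: with this patch, an address handle on an Ethernet link-type port must carry a GRH, since the dgid is all the driver has to resolve the destination MAC; mlx4_ib_create_ah() returns -EINVAL otherwise. A hypothetical consumer would set up its attributes roughly as follows (remote_gid and port are placeholders, not names from the patch):

        struct ib_ah_attr attr = {
                .port_num = port,               /* Eth link-type port */
                .ah_flags = IB_AH_GRH,          /* mandatory for RDMAoE */
        };
        struct ib_ah *ah;

        memcpy(attr.grh.dgid.raw, remote_gid, 16);
        ah = ib_create_ah(pd, &attr);           /* standard verbs call */
        if (IS_ERR(ah))
                return PTR_ERR(ah);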