On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> Hi,
> I didn't get any further comments on this one.
> Any idea why SG in CM is unwelcome?
By mistake I sent a private mail only.
Cc: Roland Dreier <rol...@kernel.org>
Cc: Sean Hefty <sean.he...@intel.com>
Cc: Hal Rosenstock <hal.rosenst...@gmail.com>
Your advice would be very appreciated.

Yuval

> I have one small fix to this patch, but it is 99% ready.
>
> Yuval
>
> On Sun, Feb 01, 2015 at 09:09:42AM +0200, Yuval Shaia wrote:
> > On Wed, Jan 28, 2015 at 12:36:27PM +0100, Yann Droneaud wrote:
> > > Hi,
> > >
> > > On Tuesday, January 27, 2015 at 03:21 -0800, Yuval Shaia wrote:
> > > > With this fix Scatter-Gather will be supported also in connected mode
> > > >
> > >
> > > Please explain the issue with the current kernel and the advantage
> > > of the proposed fix in the commit message. Perhaps benchmarks
> > > against various HCAs would be welcome.
> > The benefits of using SG are well known.
> > I think the question should be the other way around - why not support SG in CM?
> >
> > I will add the following text to the commit message:
> > By default, the IPoIB-CM driver uses a 64k MTU; a larger MTU gives better
> > performance.
> > This MTU plus overhead puts the memory allocation for IP-based packets at
> > 32 4k pages (order 5), which have to be contiguous.
> > When the system is under memory pressure, it was observed that allocating
> > 128k of contiguous physical memory is difficult and causes serious errors
> > (such as the system becoming unusable).
> > This enhancement resolves the issue by removing the physically contiguous
> > memory requirement, using the Scatter/Gather support that exists in the
> > Linux stack.
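> >
> > To illustrate the arithmetic (a rough userspace sketch, not driver code;
> > the 256-byte overhead is only an assumed figure - the point is that
> > anything above 64k rounds up to an order-5 allocation, i.e. 32 contiguous
> > 4k pages, which is what the kernel's get_order() returns for such a size):
> >
> >   #include <stdio.h>
> >
> >   int main(void)
> >   {
> >           unsigned long page = 4096;        /* PAGE_SIZE on most archs */
> >           unsigned long mtu = 64 * 1024;    /* IPoIB-CM MTU */
> >           unsigned long overhead = 256;     /* assumed skb/IPoIB overhead */
> >           unsigned long need = mtu + overhead;
> >           unsigned long pages = 1;
> >           int order = 0;
> >
> >           /* Round up to a power-of-two page count, as get_order() does
> >            * for a physically contiguous allocation.
> >            */
> >           while (pages * page < need) {
> >                   pages <<= 1;
> >                   order++;
> >           }
> >
> >           /* Prints: order 5 -> 32 pages, 131072 bytes */
> >           printf("order %d -> %lu pages, %lu bytes\n",
> >                  order, pages, pages * page);
> >           return 0;
> >   }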
> >
> > > > Signed-off-by: Yuval Shaia <yuval.sh...@oracle.com>
> > > > ---
> > > >  drivers/infiniband/ulp/ipoib/ipoib.h      |    8 +--
> > > >  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 +++++++++++++++++++++++------
> > > >  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
> > > >  drivers/infiniband/ulp/ipoib/ipoib_main.c |    3 +-
> > > >  4 files changed, 91 insertions(+), 30 deletions(-)
> > > >
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > index d7562be..fb42834 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > @@ -170,11 +170,6 @@ struct ipoib_tx_buf {
> > > >          u64 mapping[MAX_SKB_FRAGS + 1];
> > > >  };
> > > >
> > > > -struct ipoib_cm_tx_buf {
> > > > -        struct sk_buff *skb;
> > > > -        u64 mapping;
> > > > -};
> > > > -
> > > >  struct ib_cm_id;
> > > >
> > > >  struct ipoib_cm_data {
> > > > @@ -233,7 +228,7 @@ struct ipoib_cm_tx {
> > > >          struct net_device *dev;
> > > >          struct ipoib_neigh *neigh;
> > > >          struct ipoib_path *path;
> > > > -        struct ipoib_cm_tx_buf *tx_ring;
> > > > +        struct ipoib_tx_buf *tx_ring;
> > > >          unsigned tx_head;
> > > >          unsigned tx_tail;
> > > >          unsigned long flags;
> > > > @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
> > > >
> > > >  void ipoib_mcast_dev_down(struct net_device *dev);
> > > >  void ipoib_mcast_dev_flush(struct net_device *dev);
> > > > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
> > > >
> > > >  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
> > > >  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > index 933efce..056e4d2 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
> > > >                  ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
> > > >  }
> > > >
> > > > +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
> > > > +                                  struct ipoib_tx_buf *tx_req)
> > > > +{
> > > > +        struct sk_buff *skb;
> > > > +        int i, offs;
> > > > +
> > > > +        skb = tx_req->skb;
> > > > +        if (skb_shinfo(skb)->nr_frags) {
> > > > +                offs = 0;
> > > > +                if (skb_headlen(skb)) {
> > > > +                        ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
> > > > +                                            skb_headlen(skb), DMA_TO_DEVICE);
> > > > +                        offs = 1;
> > > > +                }
> > > > +                for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
> > > > +                        const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> > > > +
> > > > +                        ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs],
> > > > +                                            skb_frag_size(frag), DMA_TO_DEVICE);
> > > > +                }
> > > > +        } else
> > > > +                ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
> > > > +                                    DMA_TO_DEVICE);
> > > > +}
> > > > +
> > > >  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
> > > >  {
> > > >          struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > > @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
> > > >          return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > > >  }
> > > >
> > > > +static inline int post_send_sg(struct ipoib_dev_priv *priv,
> > > > +                               struct ipoib_cm_tx *tx,
> > > > +                               unsigned int wr_id,
> > > > +                               struct sk_buff *skb,
> > > > +                               u64 mapping[MAX_SKB_FRAGS + 1])
> > > > +{
> > > > +        struct ib_send_wr *bad_wr;
> > > > +        int i, off;
> > > > +        skb_frag_t *frags = skb_shinfo(skb)->frags;
> > > > +        int nr_frags = skb_shinfo(skb)->nr_frags;
> > > > +
> > > > +        if (skb_headlen(skb)) {
> > > > +                priv->tx_sge[0].addr = mapping[0];
> > > > +                priv->tx_sge[0].length = skb_headlen(skb);
> > > > +                off = 1;
> > > > +        } else
> > > > +                off = 0;
> > > > +
> > > > +        for (i = 0; i < nr_frags; ++i) {
> > > > +                priv->tx_sge[i + off].addr = mapping[i + off];
> > > > +                priv->tx_sge[i + off].length = frags[i].size;
> > > > +        }
> > > > +        priv->tx_wr.num_sge = nr_frags + off;
> > > > +        priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
> > > > +
> > > > +        return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > > > +}
> > > > +
> > > >  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
> > > >  {
> > > >          struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > > -        struct ipoib_cm_tx_buf *tx_req;
> > > > -        u64 addr;
> > > > +        struct ipoib_tx_buf *tx_req;
> > > > +        u64 addr = 0;
> > > >          int rc;
> > > >
> > > >          if (unlikely(skb->len > tx->mtu)) {
> > > > @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > > >           */
> > > >          tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
> > > >          tx_req->skb = skb;
> > > > -        addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
> > > > -        if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > > > -                ++dev->stats.tx_errors;
> > > > -                dev_kfree_skb_any(skb);
> > > > -                return;
> > > > -        }
> > > >
> > > > -        tx_req->mapping = addr;
> > > > +        if (skb_shinfo(skb)->nr_frags) {
> > > > +                if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
> > > > +                        ++dev->stats.tx_errors;
> > > > +                        dev_kfree_skb_any(skb);
> > > > +                        return;
> > > > +                }
> > > > +                rc = post_send_sg(priv, tx, tx->tx_head &
> > > > +                                  (ipoib_sendq_size - 1),
> > > > +                                  skb, tx_req->mapping);
> > > > +        } else {
> > > > +                addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
> > > > +                                         DMA_TO_DEVICE);
> > > > +                if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > > > +                        ++dev->stats.tx_errors;
> > > > +                        dev_kfree_skb_any(skb);
> > > > +                        return;
> > > > +                }
> > > > +
> > > > +                tx_req->mapping[0] = addr;
> > > >
> > > > -        skb_orphan(skb);
> > > > -        skb_dst_drop(skb);
> > > > +                skb_orphan(skb);
> > > > +                skb_dst_drop(skb);
> > > >
> > > > -        rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > > > -                       addr, skb->len);
> > > > +                rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > > > +                               addr, skb->len);
> > > > +        }
> > > >          if (unlikely(rc)) {
> > > >                  ipoib_warn(priv, "post_send failed, error %d\n", rc);
> > > >                  ++dev->stats.tx_errors;
> > > > -                ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
> > > > +                ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >                  dev_kfree_skb_any(skb);
> > > >          } else {
> > > >                  dev->trans_start = jiffies;
> > > > @@ -777,7 +843,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> > > >          struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > >          struct ipoib_cm_tx *tx = wc->qp->qp_context;
> > > >          unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
> > > > -        struct ipoib_cm_tx_buf *tx_req;
> > > > +        struct ipoib_tx_buf *tx_req;
> > > >          unsigned long flags;
> > > >
> > > >          ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
> > > > @@ -791,7 +857,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> > > >
> > > >          tx_req = &tx->tx_ring[wr_id];
> > > >
> > > > -        ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
> > > > +        ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >
> > > >          /* FIXME: is this right? Shouldn't we only increment on success? */
> > > >          ++dev->stats.tx_packets;
> > > > @@ -1036,6 +1102,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
> > > >
> > > >          struct ib_qp *tx_qp;
> > > >
> > > > +        if (dev->features & NETIF_F_SG)
> > > > +                attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
> > > > +
> > > >          tx_qp = ib_create_qp(priv->pd, &attr);
> > > >          if (PTR_ERR(tx_qp) == -EINVAL) {
> > > >                  ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
> > > > @@ -1170,7 +1239,7 @@ err_tx:
> > > >  static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
> > > >  {
> > > >          struct ipoib_dev_priv *priv = netdev_priv(p->dev);
> > > > -        struct ipoib_cm_tx_buf *tx_req;
> > > > +        struct ipoib_tx_buf *tx_req;
> > > >          unsigned long begin;
> > > >
> > > >          ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
> > > > @@ -1197,8 +1266,7 @@ timeout:
> > > >
> > > >          while ((int) p->tx_tail - (int) p->tx_head < 0) {
> > > >                  tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
> > > > -                ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
> > > > -                                    DMA_TO_DEVICE);
> > > > +                ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >                  dev_kfree_skb_any(tx_req->skb);
> > > >                  ++p->tx_tail;
> > > >                  netif_tx_lock_bh(p->dev);
> > > > @@ -1455,7 +1523,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
> > > >          spin_unlock_irq(&priv->lock);
> > > >  }
> > > >
> > > > -
> > >
> > > No need to remove this line.
> >
> > > >  static ssize_t show_mode(struct device *d, struct device_attribute *attr,
> > > >                           char *buf)
> > > >  {
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > index 72626c3..9e11447 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > @@ -312,8 +312,7 @@ repost:
> > > >                             "for buf %d\n", wr_id);
> > > >  }
> > > >
> > > > -static int ipoib_dma_map_tx(struct ib_device *ca,
> > > > -                            struct ipoib_tx_buf *tx_req)
> > > > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
> > > >  {
> > > >          struct sk_buff *skb = tx_req->skb;
> > > >          u64 *mapping = tx_req->mapping;
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > index 58b5aa3..f9314c6 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > @@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
> > > >          struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > >
> > > >          if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
> > > > -                features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
> > > > +                features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
> > > >
> > > >          return features;
> > > >  }
> > > > @@ -233,7 +233,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
> > > >                              "will cause multicast packet drops\n");
> > > >                  netdev_update_features(dev);
> > > >                  rtnl_unlock();
> > > > -                priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
> > >
> > > Sure about this removal? I don't see it related to scatter-gather.
> > >
> >
> > > >
> > > >                  ipoib_flush_paths(dev);
> > > >                  rtnl_lock();
> > >
> > > Regards.
> > >
> > > --
> > > Yann Droneaud
> > > OPTEYA
> > >
> >
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html