On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> Hi,
> I didn't get any further comments on this one.
> Any idea why SG in CM is unwelcome?
By mistake I sent a private mail only.
Cc: Roland Dreier <rol...@kernel.org>
Cc: Sean Hefty <sean.he...@intel.com>
Cc: Hal Rosenstock <hal.rosenst...@gmail.com>

Your advice would be very appreciated.

Yuval
> 
> I have one small fix to this patch but it is 99% ready.
> 
> Yuval
> 
> On Sun, Feb 01, 2015 at 09:09:42AM +0200, Yuval Shaia wrote:
> > On Wed, Jan 28, 2015 at 12:36:27PM +0100, Yann Droneaud wrote:
> > > Hi,
> > > 
> > > Le mardi 27 janvier 2015 à 03:21 -0800, Yuval Shaia a écrit :
> > > > With this fix Scatter-Gather will be supported also in connected mode
> > > > 
> > > 
> > > Please explain the issue with current kernel and the advantage
> > > of the proposed fix in the commit message. Perhaps benchmarks
> > > against various HCAs would be welcome.
> > The benefits of using SG are well known.
> > I think the question should be the other way around - why not support SG 
> > in CM?
> > 
> > I will add the following text to commit message:
> > By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better 
> > performance.
> > This MTU plus overhead puts the memory allocation for IP based packets at 
> > 32 4k pages (order 5), which have to be contiguous.
> > When the system memory is under pressure, it was observed that allocating 128k 
> > of contiguous physical memory is difficult and causes serious errors (such as 
> > the system becoming unusable).
> > This enhancement resolves the issue by removing the physically contiguous 
> > memory requirement using the Scatter/Gather feature that exists in the Linux stack.
> > > 
> > > > Signed-off-by: Yuval Shaia <yuval.sh...@oracle.com>
> > > > ---
> > > >  drivers/infiniband/ulp/ipoib/ipoib.h      |    8 +--
> > > >  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 
> > > > +++++++++++++++++++++++------
> > > >  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
> > > >  drivers/infiniband/ulp/ipoib/ipoib_main.c |    3 +-
> > > >  4 files changed, 91 insertions(+), 30 deletions(-)
> > > > 
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
> > > > b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > index d7562be..fb42834 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > @@ -170,11 +170,6 @@ struct ipoib_tx_buf {
> > > >         u64             mapping[MAX_SKB_FRAGS + 1];
> > > >  };
> > > >  
> > > > -struct ipoib_cm_tx_buf {
> > > > -       struct sk_buff *skb;
> > > > -       u64             mapping;
> > > > -};
> > > > -
> > > >  struct ib_cm_id;
> > > >  
> > > >  struct ipoib_cm_data {
> > > > @@ -233,7 +228,7 @@ struct ipoib_cm_tx {
> > > >         struct net_device   *dev;
> > > >         struct ipoib_neigh  *neigh;
> > > >         struct ipoib_path   *path;
> > > > -       struct ipoib_cm_tx_buf *tx_ring;
> > > > +       struct ipoib_tx_buf *tx_ring;
> > > >         unsigned             tx_head;
> > > >         unsigned             tx_tail;
> > > >         unsigned long        flags;
> > > > @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, 
> > > > int flush);
> > > >  
> > > >  void ipoib_mcast_dev_down(struct net_device *dev);
> > > >  void ipoib_mcast_dev_flush(struct net_device *dev);
> > > > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf 
> > > > *tx_req);
> > > >  
> > > >  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
> > > >  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
> > > > b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > index 933efce..056e4d2 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct 
> > > > ipoib_dev_priv *priv, int frags,
> > > >                 ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, 
> > > > DMA_FROM_DEVICE);
> > > >  }
> > > >  
> > > > +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
> > > > +                                 struct ipoib_tx_buf *tx_req)
> > > > +{
> > > > +       struct sk_buff *skb;
> > > > +       int i, offs;
> > > > +
> > > > +       skb = tx_req->skb;
> > > > +       if (skb_shinfo(skb)->nr_frags) {
> > > > +               offs = 0;
> > > > +               if (skb_headlen(skb)) {
> > > > +                       ib_dma_unmap_single(priv->ca, 
> > > > tx_req->mapping[0],
> > > > +                                           skb_headlen(skb), 
> > > > DMA_TO_DEVICE);
> > > > +                       offs = 1;
> > > > +               }
> > > > +               for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
> > > > +                       const skb_frag_t *frag = 
> > > > &skb_shinfo(skb)->frags[i];
> > > > +
> > > > +                       ib_dma_unmap_single(priv->ca, tx_req->mapping[i 
> > > > + offs],
> > > > +                                           skb_frag_size(frag), 
> > > > DMA_TO_DEVICE);
> > > > +               }
> > > > +       } else
> > > > +               ib_dma_unmap_single(priv->ca, tx_req->mapping[0], 
> > > > skb->len,
> > > > +                                   DMA_TO_DEVICE);
> > > > +}
> > > > +
> > > >  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
> > > >  {
> > > >         struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > > @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv 
> > > > *priv,
> > > >         return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > > >  }
> > > >  
> > > > +static inline int post_send_sg(struct ipoib_dev_priv *priv,
> > > > +                              struct ipoib_cm_tx *tx,
> > > > +                              unsigned int wr_id,
> > > > +                              struct sk_buff *skb,
> > > > +                              u64 mapping[MAX_SKB_FRAGS + 1])
> > > > +{
> > > > +       struct ib_send_wr *bad_wr;
> > > > +       int i, off;
> > > > +       skb_frag_t *frags = skb_shinfo(skb)->frags;
> > > > +       int nr_frags = skb_shinfo(skb)->nr_frags;
> > > > +
> > > > +       if (skb_headlen(skb)) {
> > > > +               priv->tx_sge[0].addr = mapping[0];
> > > > +               priv->tx_sge[0].length = skb_headlen(skb);
> > > > +               off = 1;
> > > > +       } else
> > > > +               off = 0;
> > > > +
> > > > +       for (i = 0; i < nr_frags; ++i) {
> > > > +               priv->tx_sge[i + off].addr = mapping[i + off];
> > > > +               priv->tx_sge[i + off].length = frags[i].size;
> > > > +       }
> > > > +       priv->tx_wr.num_sge = nr_frags + off;
> > > > +       priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
> > > > +
> > > > +       return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > > > +}
> > > > +
> > > >  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct 
> > > > ipoib_cm_tx *tx)
> > > >  {
> > > >         struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > > -       struct ipoib_cm_tx_buf *tx_req;
> > > > -       u64 addr;
> > > > +       struct ipoib_tx_buf *tx_req;
> > > > +       u64 addr = 0;
> > > >         int rc;
> > > >  
> > > >         if (unlikely(skb->len > tx->mtu)) {
> > > > @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct 
> > > > sk_buff *skb, struct ipoib_cm_
> > > >          */
> > > >         tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
> > > >         tx_req->skb = skb;
> > > > -       addr = ib_dma_map_single(priv->ca, skb->data, skb->len, 
> > > > DMA_TO_DEVICE);
> > > > -       if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > > > -               ++dev->stats.tx_errors;
> > > > -               dev_kfree_skb_any(skb);
> > > > -               return;
> > > > -       }
> > > >  
> > > > -       tx_req->mapping = addr;
> > > > +       if (skb_shinfo(skb)->nr_frags) {
> > > > +               if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
> > > > +                       ++dev->stats.tx_errors;
> > > > +                       dev_kfree_skb_any(skb);
> > > > +                       return;
> > > > +               }
> > > > +               rc = post_send_sg(priv, tx, tx->tx_head &
> > > > +                                 (ipoib_sendq_size - 1),
> > > > +                                 skb, tx_req->mapping);
> > > > +       } else {
> > > > +               addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
> > > > +                                        DMA_TO_DEVICE);
> > > > +               if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > > > +                       ++dev->stats.tx_errors;
> > > > +                       dev_kfree_skb_any(skb);
> > > > +                       return;
> > > > +               }
> > > > +
> > > > +               tx_req->mapping[0] = addr;
> > > >  
> > > > -       skb_orphan(skb);
> > > > -       skb_dst_drop(skb);
> > > > +               skb_orphan(skb);
> > > > +               skb_dst_drop(skb);
> > > >  
> > > > -       rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > > > -                      addr, skb->len);
> > > > +               rc = post_send(priv, tx, tx->tx_head & 
> > > > (ipoib_sendq_size - 1),
> > > > +                              addr, skb->len);
> > > > +       }
> > > >         if (unlikely(rc)) {
> > > >                 ipoib_warn(priv, "post_send failed, error %d\n", rc);
> > > >                 ++dev->stats.tx_errors;
> > > > -               ib_dma_unmap_single(priv->ca, addr, skb->len, 
> > > > DMA_TO_DEVICE);
> > > > +               ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >                 dev_kfree_skb_any(skb);
> > > >         } else {
> > > >                 dev->trans_start = jiffies;
> > > > @@ -777,7 +843,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, 
> > > > struct ib_wc *wc)
> > > >         struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > >         struct ipoib_cm_tx *tx = wc->qp->qp_context;
> > > >         unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
> > > > -       struct ipoib_cm_tx_buf *tx_req;
> > > > +       struct ipoib_tx_buf *tx_req;
> > > >         unsigned long flags;
> > > >  
> > > >         ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
> > > > @@ -791,7 +857,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, 
> > > > struct ib_wc *wc)
> > > >  
> > > >         tx_req = &tx->tx_ring[wr_id];
> > > >  
> > > > -       ib_dma_unmap_single(priv->ca, tx_req->mapping, 
> > > > tx_req->skb->len, DMA_TO_DEVICE);
> > > > +       ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >  
> > > >         /* FIXME: is this right? Shouldn't we only increment on 
> > > > success? */
> > > >         ++dev->stats.tx_packets;
> > > > @@ -1036,6 +1102,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct 
> > > > net_device *dev, struct ipoib_
> > > >  
> > > >         struct ib_qp *tx_qp;
> > > >  
> > > > +       if (dev->features & NETIF_F_SG)
> > > > +               attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
> > > > +
> > > >         tx_qp = ib_create_qp(priv->pd, &attr);
> > > >         if (PTR_ERR(tx_qp) == -EINVAL) {
> > > >                 ipoib_warn(priv, "can't use GFP_NOIO for QPs on device 
> > > > %s, using GFP_KERNEL\n",
> > > > @@ -1170,7 +1239,7 @@ err_tx:
> > > >  static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
> > > >  {
> > > >         struct ipoib_dev_priv *priv = netdev_priv(p->dev);
> > > > -       struct ipoib_cm_tx_buf *tx_req;
> > > > +       struct ipoib_tx_buf *tx_req;
> > > >         unsigned long begin;
> > > >  
> > > >         ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 
> > > > 0x%x\n",
> > > > @@ -1197,8 +1266,7 @@ timeout:
> > > >  
> > > >         while ((int) p->tx_tail - (int) p->tx_head < 0) {
> > > >                 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 
> > > > 1)];
> > > > -               ib_dma_unmap_single(priv->ca, tx_req->mapping, 
> > > > tx_req->skb->len,
> > > > -                                   DMA_TO_DEVICE);
> > > > +               ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >                 dev_kfree_skb_any(tx_req->skb);
> > > >                 ++p->tx_tail;
> > > >                 netif_tx_lock_bh(p->dev);
> > > > @@ -1455,7 +1523,6 @@ static void ipoib_cm_stale_task(struct 
> > > > work_struct *work)
> > > >         spin_unlock_irq(&priv->lock);
> > > >  }
> > > >  
> > > > -
> > > 
> > > No need to remove this line.
> > > 
> > > >  static ssize_t show_mode(struct device *d, struct device_attribute 
> > > > *attr,
> > > >                          char *buf)
> > > >  {
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
> > > > b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > index 72626c3..9e11447 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > @@ -312,8 +312,7 @@ repost:
> > > >                            "for buf %d\n", wr_id);
> > > >  }
> > > >  
> > > > -static int ipoib_dma_map_tx(struct ib_device *ca,
> > > > -                           struct ipoib_tx_buf *tx_req)
> > > > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
> > > >  {
> > > >         struct sk_buff *skb = tx_req->skb;
> > > >         u64 *mapping = tx_req->mapping;
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > index 58b5aa3..f9314c6 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > @@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct 
> > > > net_device *dev, netdev_featu
> > > >         struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > >  
> > > >         if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
> > > > -               features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | 
> > > > NETIF_F_TSO);
> > > > +               features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
> > > >  
> > > >         return features;
> > > >  }
> > > > @@ -233,7 +233,6 @@ int ipoib_set_mode(struct net_device *dev, const 
> > > > char *buf)
> > > >                            "will cause multicast packet drops\n");
> > > >                 netdev_update_features(dev);
> > > >                 rtnl_unlock();
> > > > -               priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
> > > 
> > > Sure about this removal ? I don't see it related to scatter-gather.
> > > 
> > > >  
> > > >                 ipoib_flush_paths(dev);
> > > >                 rtnl_lock();
> > > 
> > > Regards.
> > > 
> > > -- 
> > > Yann Droneaud
> > > OPTEYA
> > > 
> > > 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to