Hi,
Please see inline.

On Wednesday, October 25, 2017 7:50 PM Adrien Mazarguil wrote:
> 
> Hi Ophir,
> 
> On Mon, Oct 23, 2017 at 02:22:00PM +0000, Ophir Munk wrote:
> > This commit optimizes handling of one segment and calls a dedicated
> > function for handling multi segments
> >
> > Signed-off-by: Ophir Munk <ophi...@mellanox.com>
> 
> While it indeed moves the code to a separate function, I'm not sure by how
> much it improves performance.
> 
> Is it noticeably better? Can you provide a short performance summary with
> and without this patch? Is that the case for both single and multi-segment
> scenarios, or was this improvement at the cost of a degradation in the latter
> case?
> 

In v3 this commit is squashed into the previous commit "net/mlx4: improve
performance of one Tx segment", as both commits represent one logical unit.
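
For reference, the squash was done along these lines (interactive rebase;
the commit count here is illustrative):

    git rebase -i HEAD~2
    # In the todo list, change "pick" to "squash" (or "fixup") on the
    # second commit, then save and merge the commit messages.
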
On Matan's setup, the two commits together improve performance in both
single and multi-segment scenarios. On my setup, the improvement occurs for
the single-segment case only:
With patch versus without patch (segment sizes in bytes):
64              +0.2 mpps
64,64           -0.2 mpps
64,64,64,64     -0.07 mpps

> If it splits a large function in two smaller ones for readability and no
> performance validation was done on this specific patch alone, please do not
> label it as a performance improvement. I'm fine with readability
> improvements when properly identified as such.
> 

The performance improvement indication was removed from the commit message.

> A few additional comments below.
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 284 +++++++++++++++++++++++--------------------
> >  1 file changed, 154 insertions(+), 130 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> > index 3236552..9596859 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -62,6 +62,9 @@
> >  #include "mlx4_rxtx.h"
> >  #include "mlx4_utils.h"
> >
> > +#define WQE_ONE_DATA_SEG_SIZE \
> > +   (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
> > +
> >  /**
> >   * Pointer-value pair structure used in tx_post_send for saving the first
> >   * DWORD (32 byte) of a TXBB.
> > @@ -140,22 +143,19 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
> >   * @return
> >   *   0 on success, -1 on failure.
> >   */
> > -static int
> > -mlx4_txq_complete(struct txq *txq)
> > +static inline int __attribute__((always_inline))
> 
> Should be static only, leave the rest to the compiler. This function is large
> enough that it shouldn't make much of a difference anyway (unless proved
> otherwise).
> 

Done.
__attribute__((always_inline)) was removed.
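
For reference, the v3 declaration should read along these lines (plain
static, inlining left to the compiler):

    static int
    mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
                      struct mlx4_sq *sq)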

> > +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
> > +                           struct mlx4_sq *sq)
> >  {
> >     unsigned int elts_comp = txq->elts_comp;
> >     unsigned int elts_tail = txq->elts_tail;
> > -   const unsigned int elts_n = txq->elts_n;
> >     struct mlx4_cq *cq = &txq->mcq;
> > -   struct mlx4_sq *sq = &txq->msq;
> >     struct mlx4_cqe *cqe;
> >     uint32_t cons_index = cq->cons_index;
> >     uint16_t new_index;
> >     uint16_t nr_txbbs = 0;
> >     int pkts = 0;
> >
> > -   if (unlikely(elts_comp == 0))
> > -           return 0;
> >     /*
> >      * Traverse over all CQ entries reported and handle each WQ entry
> >      * reported by them.
> > @@ -266,6 +266,120 @@ rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
> >     return txq->mp2mr[i].lkey;
> >  }
> >
> > +static int handle_multi_segs(struct rte_mbuf *buf,
> > +                       struct txq *txq,
> > +                       struct mlx4_wqe_ctrl_seg **pctrl) {
> > +   int wqe_real_size;
> > +   int nr_txbbs;
> > +   struct pv *pv = (struct pv *)txq->bounce_buf;
> > +   struct mlx4_sq *sq = &txq->msq;
> > +   uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> > +   struct mlx4_wqe_ctrl_seg *ctrl;
> > +   struct mlx4_wqe_data_seg *dseg;
> > +   uintptr_t addr;
> > +   uint32_t byte_count;
> > +   int pv_counter = 0;
> > +
> > +   /* Calculate the needed work queue entry size for this packet. */
> > +   wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > +           buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> > +   nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> > +   /*
> > +    * Check that there is room for this WQE in the send queue and that
> > +    * the WQE size is legal.
> > +    */
> > +   if (((sq->head - sq->tail) + nr_txbbs +
> > +                           sq->headroom_txbbs) >= sq->txbb_cnt ||
> > +                   nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> > +           return -1;
> > +   }
> > +
> > +   /* Get the control and data entries of the WQE. */
> > +   ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
> > +   dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > +                   sizeof(struct mlx4_wqe_ctrl_seg));
> > +   *pctrl = ctrl;
> > +   /* Fill the data segments with buffer information. */
> > +   struct rte_mbuf *sbuf;
> > +
> > +   for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> > +           addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> > +           rte_prefetch0((volatile void *)addr);
> > +           /* Handle WQE wraparound. */
> > +           if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
> > +                   dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> > +           dseg->addr = rte_cpu_to_be_64(addr);
> > +           /* Memory region key (big endian) for this memory pool. */
> > +           dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> > +#ifndef NDEBUG
> > +           /* Calculate the needed work queue entry size for this packet */
> > +           if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
> > +                   /* MR does not exist. */
> > +                   DEBUG("%p: unable to get MP <-> MR association",
> > +                                   (void *)txq);
> > +                   /*
> > +                    * Restamp entry in case of failure.
> > +                    * Make sure that size is written correctly
> > +                    * Note that we give ownership to the SW, not the HW.
> > +                    */
> > +                   wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > +                           buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> > +                   ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +                   mlx4_txq_stamp_freed_wqe(sq, head_idx,
> > +                                   (sq->head & sq->txbb_cnt) ? 0 : 1);
> > +                   return -1;
> > +           }
> > +#endif /* NDEBUG */
> > +           if (likely(sbuf->data_len)) {
> > +                   byte_count = rte_cpu_to_be_32(sbuf->data_len);
> > +           } else {
> > +                   /*
> > +                    * Zero length segment is treated as inline segment
> > +                    * with zero data.
> > +                    */
> > +                   byte_count = RTE_BE32(0x80000000);
> > +           }
> > +           /*
> > +            * If the data segment is not at the beginning of a
> > +            * Tx basic block (TXBB) then write the byte count,
> > +            * else postpone the writing to just before updating the
> > +            * control segment.
> > +            */
> > +           if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > +                   /*
> > +                    * Need a barrier here before writing the byte_count
> > +                    * fields to make sure that all the data is visible
> > +                    * before the byte_count field is set.
> > +                    * Otherwise, if the segment begins a new cacheline,
> > +                    * the HCA prefetcher could grab the 64-byte chunk and
> > +                    * get a valid (!= 0xffffffff) byte count but stale
> > +                    * data, and end up sending the wrong data.
> > +                    */
> > +                   rte_io_wmb();
> > +                   dseg->byte_count = byte_count;
> > +           } else {
> > +                   /*
> > +                    * This data segment starts at the beginning of a new
> > +                    * TXBB, so we need to postpone its byte_count writing
> > +                    * for later.
> > +                    */
> > +                   pv[pv_counter].dseg = dseg;
> > +                   pv[pv_counter++].val = byte_count;
> > +           }
> > +   }
> > +   /* Write the first DWORD of each TXBB save earlier. */
> > +   if (pv_counter) {
> > +           /* Need a barrier here before writing the byte_count. */
> > +           rte_io_wmb();
> > +           for (--pv_counter; pv_counter  >= 0; pv_counter--)
> > +                   pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
> > +   }
> > +   /* Fill the control parameters for this packet. */
> > +   ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +
> > +   return nr_txbbs;
> > +}
> >  /**
> >   * DPDK callback for Tx.
> >   *
> > @@ -288,10 +402,11 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> >     unsigned int i;
> >     unsigned int max;
> >     struct mlx4_sq *sq = &txq->msq;
> > -   struct pv *pv = (struct pv *)txq->bounce_buf;
> > +   int nr_txbbs;
> >
> >     assert(txq->elts_comp_cd != 0);
> > -   mlx4_txq_complete(txq);
> > +   if (likely(txq->elts_comp != 0))
> > +           mlx4_txq_complete(txq, elts_n, sq);
> >     max = (elts_n - (elts_head - txq->elts_tail));
> >     if (max > elts_n)
> >             max -= elts_n;
> > @@ -316,10 +431,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> >             } srcrb;
> >             uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> >             uintptr_t addr;
> > -           uint32_t byte_count;
> > -           int wqe_real_size;
> > -           int nr_txbbs;
> > -           int pv_counter = 0;
> >
> >             /* Clean up old buffer. */
> >             if (likely(elt->buf != NULL)) {
> > @@ -338,31 +449,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> >                     } while (tmp != NULL);
> >             }
> >             RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -
> > -           /*
> > -            * Calculate the needed work queue entry size
> > -            * for this packet.
> > -            */
> > -           wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> > -                           buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> > -           nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> > -           /*
> > -            * Check that there is room for this WQE in the send
> > -            * queue and that the WQE size is legal.
> > -            */
> > -           if (((sq->head - sq->tail) + nr_txbbs +
> > -                sq->headroom_txbbs) >= sq->txbb_cnt ||
> > -               nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> > -                   elt->buf = NULL;
> > -                   break;
> > -           }
> > -           /* Get the control and data entries of the WQE. */
> > -           ctrl = (struct mlx4_wqe_ctrl_seg *)
> > -                           mlx4_get_send_wqe(sq, head_idx);
> > -           dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > -                           sizeof(struct mlx4_wqe_ctrl_seg));
> > -           /* Fill the data segments with buffer information. */
> >             if (likely(buf->nb_segs == 1)) {
> > +                   /*
> > +                    * Check that there is room for this WQE in the send
> > +                    * queue and that the WQE size is legal
> > +                    */
> > +                   if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
> > +                                           >= sq->txbb_cnt ||
> > +                                           1 > MLX4_MAX_WQE_TXBBS) {
> > +                           elt->buf = NULL;
> > +                           break;
> > +                   }
> > +                   /* Get the control and data entries of the WQE. */
> > +                   ctrl = (struct mlx4_wqe_ctrl_seg *)
> > +                                   mlx4_get_send_wqe(sq, head_idx);
> > +                   dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> > +                                   sizeof(struct mlx4_wqe_ctrl_seg));
> >                     addr = rte_pktmbuf_mtod(buf, uintptr_t);
> >                     rte_prefetch0((volatile void *)addr);
> >                     /* Handle WQE wraparound. */
> > @@ -371,120 +473,42 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> >                             dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> >                     dseg->addr = rte_cpu_to_be_64(addr);
> >                     /* Memory region key (big endian). */
> > -                   dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> > -   #ifndef NDEBUG
> > +                   dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
> > +#ifndef NDEBUG
> >                     if (unlikely(dseg->lkey ==
> >                             rte_cpu_to_be_32((uint32_t)-1))) {
> >                             /* MR does not exist. */
> >                             DEBUG("%p: unable to get MP <-> MR association",
> > -                                 (void *)txq);
> > +                                           (void *)txq);
> >                             /*
> >                              * Restamp entry in case of failure.
> >                              * Make sure that size is written correctly
> >                              * Note that we give ownership to the SW,
> >                              * not the HW.
> >                              */
> > -                           ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +                           ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4)
> > +                                                   & 0x3f;
> >                             mlx4_txq_stamp_freed_wqe(sq, head_idx,
> > -                                        (sq->head & sq->txbb_cnt) ? 0 : 1);
> > +                                   (sq->head & sq->txbb_cnt) ? 0 : 1);
> >                             elt->buf = NULL;
> >                             break;
> >                     }
> > -   #endif /* NDEBUG */
> > +#endif /* NDEBUG */
> >                     /* Need a barrier here before writing the byte_count. */
> >                     rte_io_wmb();
> >                     dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> > +
> > +                   /* Fill the control parameters for this packet. */
> > +                   ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
> > +                   nr_txbbs = 1;
> >             } else {
> > -                   /* Fill the data segments with buffer information. */
> > -                   struct rte_mbuf *sbuf;
> > -
> > -                   for (sbuf = buf;
> > -                            sbuf != NULL;
> > -                            sbuf = sbuf->next, dseg++) {
> > -                           addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> > -                           rte_prefetch0((volatile void *)addr);
> > -                           /* Handle WQE wraparound. */
> > -                           if (unlikely(dseg >=
> > -                                   (struct mlx4_wqe_data_seg *)sq->eob))
> > -                                   dseg = (struct mlx4_wqe_data_seg *)
> > -                                                   sq->buf;
> > -                           dseg->addr = rte_cpu_to_be_64(addr);
> > -                           /* Memory region key (big endian). */
> > -                           dseg->lkey = mlx4_txq_mp2mr(txq,
> > -                                           mlx4_txq_mb2mp(sbuf));
> > -           #ifndef NDEBUG
> > -                           if (unlikely(dseg->lkey ==
> > -                                   rte_cpu_to_be_32((uint32_t)-1))) {
> > -                                   /* MR does not exist. */
> > -                                   DEBUG("%p: unable to get MP <-> MR association",
> > -                                             (void *)txq);
> > -                                   /*
> > -                                    * Restamp entry in case of failure.
> > -                                    * Make sure that size is written
> > -                                    * correctly, note that we give
> > -                                    * ownership to the SW, not the HW.
> > -                                    */
> > -                                   ctrl->fence_size =
> > -                                           (wqe_real_size >> 4) & 0x3f;
> > -                                   mlx4_txq_stamp_freed_wqe(sq, head_idx,
> > -                                       (sq->head & sq->txbb_cnt) ? 0 : 1);
> > -                                   elt->buf = NULL;
> > -                                   break;
> > -                           }
> > -           #endif /* NDEBUG */
> > -                           if (likely(sbuf->data_len)) {
> > -                                   byte_count =
> > -                                     rte_cpu_to_be_32(sbuf->data_len);
> > -                           } else {
> > -                                   /*
> > -                                    * Zero length segment is treated as
> > -                                    * inline segment with zero data.
> > -                                    */
> > -                                   byte_count = RTE_BE32(0x80000000);
> > -                           }
> > -                           /*
> > -                            * If the data segment is not at the beginning
> > -                            * of a Tx basic block (TXBB) then write the
> > -                            * byte count, else postpone the writing to
> > -                            * just before updating the control segment.
> > -                            */
> > -                           if ((uintptr_t)dseg &
> > -                                   (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> > -                                   /*
> > -                                    * Need a barrier here before writing
> > -                                    * the byte_count fields to make sure
> > -                                    * that all the data is visible before
> > -                                    * the byte_count field is set.
> > -                                    * Otherwise, if the segment begins a
> > -                                    * new cacheline, the HCA prefetcher
> > -                                    * could grab the 64-byte chunk and
> get
> > -                                    * a valid (!= 0xffffffff) byte count
> > -                                    * but stale data, and end up sending
> > -                                    * the wrong data.
> > -                                    */
> > -                                   rte_io_wmb();
> > -                                   dseg->byte_count = byte_count;
> > -                           } else {
> > -                                   /*
> > -                                    * This data segment starts at the
> > -                                    * beginning of a new TXBB, so we
> > -                                    * need to postpone its byte_count
> > -                                    * writing for later.
> > -                                    */
> > -                                   pv[pv_counter].dseg = dseg;
> > -                                   pv[pv_counter++].val = byte_count;
> > -                           }
> > +                   nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
> 
> Having all this non-inline could degrade multi-segment performance; is
> that OK?

The function can still be inlined: it is static and defined in the same
translation unit, so the compiler is free to inline it on its own.
Performance is not degraded in this case.
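
A minimal sketch of the idea (hypothetical names, not driver code):

    /* Internal linkage and a definition visible at the call site make
     * the function a natural inlining candidate at -O2/-O3. */
    static int
    add_one(int x)
    {
            return x + 1;
    }

    int
    caller(int x)
    {
            /* GCC/Clang typically inline this call at -O2 even without
             * an inline keyword or attribute; compare "gcc -O2 -S"
             * output with and without -fno-inline to confirm. */
            return add_one(x);
    }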

> 
> > +                   if (nr_txbbs < 0) {
> > +                           elt->buf = NULL;
> > +                           break;
> >                     }
> > -           /* Write the first DWORD of each TXBB save earlier. */
> > -           if (pv_counter) {
> > -                   /* Need a barrier before writing the byte_count. */
> > -                   rte_io_wmb();
> > -                   for (--pv_counter; pv_counter  >= 0; pv_counter--)
> > -                           pv[pv_counter].dseg->byte_count =
> > -                                           pv[pv_counter].val;
> >             }
> > -           /* Fill the control parameters for this packet. */
> > -           ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> > +
> >             /*
> >              * For raw Ethernet, the SOLICIT flag is used to indicate
> >              * that no ICRC should be calculated.
> > --
> > 2.7.4
> >
> 
> --
> Adrien Mazarguil
> 6WIND
