Author: hselasky
Date: Fri Jun  3 09:03:44 2016
New Revision: 301258
URL: https://svnweb.freebsd.org/changeset/base/301258

Log:
  MFC r300277:
  Implement TX completion event interleaving.
  
  This patch implements a sysctl which allows setting a factor, N, for
  how many work queue elements may be generated before a completion
  event is required. When a completion event happens, the code simulates
  N completion events instead of only one. When draining a transmit
  queue, at most N-1 NOPs are transmitted to force generation of the
  final completion event. Further, a timer runs every hz ticks (once
  per second) to flush any remaining data off the transmit queue when
  tx_completion_fact > 1.
  
  The goal of this feature is to reduce the PCI bandwidth needed when
  transmitting data.
  
  Sponsored by: Mellanox Technologies
  Tested by:    Netflix
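
As a minimal sketch of the idea (illustrative names only, not the driver's
actual structures; the real implementation is the mlx5e_do_send_cqe() hunk
in mlx5_en_tx.c below), the per-WQE decision reduces to a counter compared
against the configured factor:

	#include <stdbool.h>

	struct txq {
		unsigned counter;	/* WQEs posted since last completion request */
		unsigned factor;	/* completion event ratio (N), >= 1 */
	};

	/* Request a hardware completion only for every N-th work queue element. */
	static bool
	want_completion_event(struct txq *q)
	{
		if (++q->counter >= q->factor) {
			q->counter = 0;
			return (true);	/* set the CQ-update flag on this WQE */
		}
		return (false);		/* post this WQE without a completion event */
	}

With factor N, only one in every N transmitted WQEs asks the hardware for a
completion, and the completion handler then reclaims N WQEs per CQE.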

Modified:
  stable/10/sys/dev/mlx5/mlx5_en/en.h
  stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
  stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/dev/mlx5/mlx5_en/en.h
==============================================================================
--- stable/10/sys/dev/mlx5/mlx5_en/en.h Fri Jun  3 09:03:10 2016        (r301257)
+++ stable/10/sys/dev/mlx5/mlx5_en/en.h Fri Jun  3 09:03:44 2016        (r301258)
@@ -393,6 +393,8 @@ struct mlx5e_params {
   m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \
   m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \
   m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
+  m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
+  m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
   m(+1, u64 hw_lro, "hw_lro", "set to enable hw_lro") \
   m(+1, u64 cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled")
 
@@ -498,6 +500,13 @@ struct mlx5e_sq {
        /* dirtied @xmit */
        u16     pc __aligned(MLX5E_CACHELINE_SIZE);
        u16     bf_offset;
+       u16     cev_counter;            /* completion event counter */
+       u16     cev_factor;             /* completion event factor */
+       u32     cev_next_state;         /* next completion event state */
+#define        MLX5E_CEV_STATE_INITIAL 0       /* timer not started */
+#define        MLX5E_CEV_STATE_SEND_NOPS 1     /* send NOPs */
+#define        MLX5E_CEV_STATE_HOLD_NOPS 2     /* don't send NOPs yet */
+       struct callout cev_callout;
        struct  mlx5e_sq_stats stats;
 
        struct  mlx5e_cq cq;
@@ -789,6 +798,7 @@ void        mlx5e_create_stats(struct sysctl_ct
     struct sysctl_oid_list *, const char *,
     const char **, unsigned, u64 *);
 void   mlx5e_send_nop(struct mlx5e_sq *, u32, bool);
+void   mlx5e_sq_cev_timeout(void *);
 int    mlx5e_refresh_channel_params(struct mlx5e_priv *);
 
 #endif                                 /* _MLX5_EN_H_ */

Modified: stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
==============================================================================
--- stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c    Fri Jun  3 09:03:10 2016        (r301257)
+++ stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c    Fri Jun  3 09:03:44 2016        (r301258)
@@ -48,6 +48,42 @@ mlx5e_create_stats(struct sysctl_ctx_lis
        }
 }
 
+static void
+mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv)
+{
+       /*
+        * Limit the maximum distance between completion events to
+        * half of the currently set TX queue size.
+        *
+        * The maximum number of queue entries a single IP packet can
+        * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
+        *
+        * The worst case max value is then given as below:
+        */
+       uint64_t max = priv->params_ethtool.tx_queue_size /
+           (2 * MLX5_SEND_WQE_MAX_WQEBBS);
+
+       /*
+        * Update the maximum completion factor value in case the
+        * tx_queue_size field changed. Ensure we don't overflow
+        * 16-bits.
+        */
+       if (max < 1)
+               max = 1;
+       else if (max > 65535)
+               max = 65535;
+       priv->params_ethtool.tx_completion_fact_max = max;
+
+       /*
+        * Verify that the current TX completion factor is within the
+        * given limits:
+        */
+       if (priv->params_ethtool.tx_completion_fact < 1)
+               priv->params_ethtool.tx_completion_fact = 1;
+       else if (priv->params_ethtool.tx_completion_fact > max)
+               priv->params_ethtool.tx_completion_fact = max;
+}
+
 static int
 mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
 {
@@ -206,6 +242,14 @@ mlx5e_ethtool_handler(SYSCTL_HANDLER_ARG
                        priv->params_ethtool.cqe_zipping = 0;
                }
        }
+
+       if (&priv->params_ethtool.arg[arg2] ==
+           &priv->params_ethtool.tx_completion_fact ||
+           &priv->params_ethtool.arg[arg2] ==
+           &priv->params_ethtool.tx_queue_size) {
+               /* verify parameter */
+               mlx5e_ethtool_sync_tx_completion_fact(priv);
+       }
        if (was_opened)
                mlx5e_open_locked(priv->ifp);
 done:
@@ -475,6 +519,7 @@ mlx5e_create_ethtool(struct mlx5e_priv *
        priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts;
        priv->params_ethtool.hw_lro = priv->params.hw_lro_en;
        priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en;
+       mlx5e_ethtool_sync_tx_completion_fact(priv);
 
        /* create root node */
        node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,

Modified: stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_main.c       Fri Jun  3 09:03:10 2016        (r301257)
+++ stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_main.c       Fri Jun  3 09:03:44 2016        (r301258)
@@ -1185,24 +1185,82 @@ err_destroy_sq:
 }
 
 static void
-mlx5e_close_sq(struct mlx5e_sq *sq)
+mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
 {
-
-       /* ensure hw is notified of all pending wqes */
-       if (mlx5e_sq_has_room_for(sq, 1))
+       /* fill up remainder with NOPs */
+       while (sq->cev_counter != 0) {
+               while (!mlx5e_sq_has_room_for(sq, 1)) {
+                       if (can_sleep != 0) {
+                               mtx_unlock(&sq->lock);
+                               msleep(4);
+                               mtx_lock(&sq->lock);
+                       } else {
+                               goto done;
+                       }
+               }
                mlx5e_send_nop(sq, 1, true);
+       }
+done:
+       return;
+}
 
-       mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+void
+mlx5e_sq_cev_timeout(void *arg)
+{
+       struct mlx5e_sq *sq = arg;
+
+       mtx_assert(&sq->lock, MA_OWNED);
+
+       /* check next state */
+       switch (sq->cev_next_state) {
+       case MLX5E_CEV_STATE_SEND_NOPS:
+               /* fill TX ring with NOPs, if any */
+               mlx5e_sq_send_nops_locked(sq, 0);
+
+               /* check if completed */
+               if (sq->cev_counter == 0) {
+                       sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
+                       return;
+               }
+               break;
+       default:
+               /* send NOPs on next timeout */
+               sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
+               break;
+       }
+
+       /* restart timer */
+       callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
 }
 
 static void
 mlx5e_close_sq_wait(struct mlx5e_sq *sq)
 {
+
+       mtx_lock(&sq->lock);
+       /* teardown event factor timer, if any */
+       sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
+       callout_stop(&sq->cev_callout);
+
+       /* send dummy NOPs in order to flush the transmit ring */
+       mlx5e_sq_send_nops_locked(sq, 1);
+       mtx_unlock(&sq->lock);
+
+       /* make sure it is safe to free the callout */
+       callout_drain(&sq->cev_callout);
+
+       /* error out remaining requests */
+       mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+
        /* wait till SQ is empty */
+       mtx_lock(&sq->lock);
        while (sq->cc != sq->pc) {
+               mtx_unlock(&sq->lock);
                msleep(4);
                sq->cq.mcq.comp(&sq->cq.mcq);
+               mtx_lock(&sq->lock);
        }
+       mtx_unlock(&sq->lock);
 
        mlx5e_disable_sq(sq);
        mlx5e_destroy_sq(sq);
@@ -1412,24 +1470,13 @@ mlx5e_open_sqs(struct mlx5e_channel *c,
        return (0);
 
 err_close_sqs:
-       for (tc--; tc >= 0; tc--) {
-               mlx5e_close_sq(&c->sq[tc]);
+       for (tc--; tc >= 0; tc--)
                mlx5e_close_sq_wait(&c->sq[tc]);
-       }
 
        return (err);
 }
 
 static void
-mlx5e_close_sqs(struct mlx5e_channel *c)
-{
-       int tc;
-
-       for (tc = 0; tc < c->num_tc; tc++)
-               mlx5e_close_sq(&c->sq[tc]);
-}
-
-static void
 mlx5e_close_sqs_wait(struct mlx5e_channel *c)
 {
        int tc;
@@ -1446,9 +1493,19 @@ mlx5e_chan_mtx_init(struct mlx5e_channel
        mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
 
        for (tc = 0; tc < c->num_tc; tc++) {
-               mtx_init(&c->sq[tc].lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
-               mtx_init(&c->sq[tc].comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
+               struct mlx5e_sq *sq = c->sq + tc;
+
+               mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
+               mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
                    MTX_DEF);
+
+               callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+
+               sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
+
+               /* ensure the TX completion event factor is not zero */
+               if (sq->cev_factor == 0)
+                       sq->cev_factor = 1;
        }
 }
 
@@ -1529,7 +1586,6 @@ mlx5e_open_channel(struct mlx5e_priv *pr
        return (0);
 
 err_close_sqs:
-       mlx5e_close_sqs(c);
        mlx5e_close_sqs_wait(c);
 
 err_close_rx_cq:
@@ -1554,7 +1610,6 @@ mlx5e_close_channel(struct mlx5e_channel
        if (c == NULL)
                return;
        mlx5e_close_rq(&c->rq);
-       mlx5e_close_sqs(c);
 }
 
 static void

Modified: stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
==============================================================================
--- stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c Fri Jun  3 09:03:10 2016        (r301257)
+++ stable/10/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c Fri Jun  3 09:03:44 2016        (r301258)
@@ -28,6 +28,18 @@
 #include "en.h"
 #include <machine/atomic.h>
 
+static inline bool
+mlx5e_do_send_cqe(struct mlx5e_sq *sq)
+{
+       sq->cev_counter++;
+       /* interleave the CQEs */
+       if (sq->cev_counter >= sq->cev_factor) {
+               sq->cev_counter = 0;
+               return (1);
+       }
+       return (0);
+}
+
 void
 mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
 {
@@ -38,7 +50,10 @@ mlx5e_send_nop(struct mlx5e_sq *sq, u32 
 
        wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
        wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
-       wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+       if (mlx5e_do_send_cqe(sq))
+               wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+       else
+               wqe->ctrl.fm_ce_se = 0;
 
        sq->mbuf[pi].mbuf = NULL;
        sq->mbuf[pi].num_bytes = 0;
@@ -340,7 +355,10 @@ skip_dma:
 
        wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
        wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
-       wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+       if (mlx5e_do_send_cqe(sq))
+               wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+       else
+               wqe->ctrl.fm_ce_se = 0;
 
        /* Store pointer to mbuf */
        sq->mbuf[pi].mbuf = mb;
@@ -374,9 +392,10 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, in
         */
        sqcc = sq->cc;
 
-       while (budget--) {
+       while (budget > 0) {
                struct mlx5_cqe64 *cqe;
                struct mbuf *mb;
+               u16 x;
                u16 ci;
 
                cqe = mlx5e_get_cqe(&sq->cq);
@@ -385,24 +404,29 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, in
 
                mlx5_cqwq_pop(&sq->cq.wq);
 
-               ci = sqcc & sq->wq.sz_m1;
-               mb = sq->mbuf[ci].mbuf;
-               sq->mbuf[ci].mbuf = NULL;       /* Safety clear */
+               /* update budget according to the event factor */
+               budget -= sq->cev_factor;
 
-               if (mb == NULL) {
-                       if (sq->mbuf[ci].num_bytes == 0) {
-                               /* NOP */
-                               sq->stats.nop++;
-                       }
-               } else {
-                       bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
-                           BUS_DMASYNC_POSTWRITE);
-                       bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
+               for (x = 0; x != sq->cev_factor; x++) {
+                       ci = sqcc & sq->wq.sz_m1;
+                       mb = sq->mbuf[ci].mbuf;
+                       sq->mbuf[ci].mbuf = NULL;       /* Safety clear */
+
+                       if (mb == NULL) {
+                               if (sq->mbuf[ci].num_bytes == 0) {
+                                       /* NOP */
+                                       sq->stats.nop++;
+                               }
+                       } else {
+                               bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
+                                   BUS_DMASYNC_POSTWRITE);
+                               bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
 
-                       /* Free transmitted mbuf */
-                       m_freem(mb);
+                               /* Free transmitted mbuf */
+                               m_freem(mb);
+                       }
+                       sqcc += sq->mbuf[ci].num_wqebbs;
                }
-               sqcc += sq->mbuf[ci].num_wqebbs;
        }
 
        mlx5_cqwq_update_db_record(&sq->cq.wq);
@@ -450,6 +474,18 @@ mlx5e_xmit_locked(struct ifnet *ifp, str
                if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
                        break;
        }
+       /*
+        * Check if we need to start the event timer which flushes the
+        * transmit ring on timeout:
+        */
+       if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
+           sq->cev_factor != 1)) {
+               /* start the timer */
+               mlx5e_sq_cev_timeout(sq);
+       } else {
+               /* don't send NOPs yet */
+               sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
+       }
        return (err);
 }
 