Author: davidcs
Date: Thu Aug 24 18:51:55 2017
New Revision: 322851
URL: https://svnweb.freebsd.org/changeset/base/322851

Log:
  MFC r322408
  Performance enhancements to reduce CPU utilization for a large number of
  TCP connections (on the order of tens of thousands), with predominantly
  transmits.
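  
  Editor's note: this change also adds a hw.qlnxe.queue_count knob
  (SYSCTL_INT with CTLFLAG_RDTUN in the qlnx_os.c hunk below), so the queue
  count can be set as a boot-time loader tunable. A minimal usage sketch,
  assuming the standard FreeBSD loader tunable mechanism; the value "4" is
  only an illustrative example:
  
      # /boot/loader.conf
      hw.qlnxe.queue_count="0"   # 0 = auto (QLNX_DEFAULT_RSS queues)
      # or a fixed count, validated against QLNX_MAX_RSS in qlnx_get_params():
      hw.qlnxe.queue_count="4"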
  
  Submitted by: vaishali.kulka...@cavium.com

Modified:
  stable/11/sys/dev/qlnx/qlnxe/qlnx_def.h
  stable/11/sys/dev/qlnx/qlnxe/qlnx_os.c
  stable/11/sys/dev/qlnx/qlnxe/qlnx_ver.h
  stable/11/sys/modules/qlnx/qlnxe/Makefile
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/dev/qlnx/qlnxe/qlnx_def.h
==============================================================================
--- stable/11/sys/dev/qlnx/qlnxe/qlnx_def.h     Thu Aug 24 18:01:17 2017        (r322850)
+++ stable/11/sys/dev/qlnx/qlnxe/qlnx_def.h     Thu Aug 24 18:51:55 2017        (r322851)
@@ -50,9 +50,10 @@ struct qlnx_ivec {
 
 typedef struct qlnx_ivec qlnx_ivec_t;
 
-//#define QLNX_MAX_RSS 30
-#define QLNX_MAX_RSS   16
-#define QLNX_MAX_TC    1
+//#define QLNX_MAX_RSS         30
+#define QLNX_MAX_RSS           36
+#define QLNX_DEFAULT_RSS       16
+#define QLNX_MAX_TC            1
 
 enum QLNX_STATE {
         QLNX_STATE_CLOSED,
@@ -201,6 +202,17 @@ struct qlnx_fastpath {
        uint64_t                tx_pkts_freed;
        uint64_t                tx_pkts_transmitted;
        uint64_t                tx_pkts_completed;
+       uint64_t                tx_tso_pkts;
+       uint64_t                tx_non_tso_pkts;
+
+#ifdef QLNX_TRACE_PERF_DATA
+       uint64_t                tx_pkts_trans_ctx;
+       uint64_t                tx_pkts_compl_ctx;
+       uint64_t                tx_pkts_trans_fp;
+       uint64_t                tx_pkts_compl_fp;
+       uint64_t                tx_pkts_compl_intr;
+#endif
+
        uint64_t                tx_lso_wnd_min_len;
        uint64_t                tx_defrag;
        uint64_t                tx_nsegs_gt_elem_left;
@@ -209,6 +221,13 @@ struct qlnx_fastpath {
        uint32_t                tx_tso_max_pkt_len;
        uint32_t                tx_tso_min_pkt_len;
        uint64_t                tx_pkts[QLNX_FP_MAX_SEGS];
+
+#ifdef QLNX_TRACE_PERF_DATA
+       uint64_t                tx_pkts_hist[QLNX_FP_MAX_SEGS];
+       uint64_t                tx_comInt[QLNX_FP_MAX_SEGS];
+       uint64_t                tx_pkts_q[QLNX_FP_MAX_SEGS];
+#endif
+
        uint64_t                err_tx_nsegs_gt_elem_left;
         uint64_t                err_tx_dmamap_create;
         uint64_t                err_tx_defrag_dmamap_load;
@@ -301,8 +320,13 @@ typedef struct qlnx_link_output qlnx_link_output_t;
 #define QLNX_MFW_VERSION_LENGTH 32
 #define QLNX_STORMFW_VERSION_LENGTH 32
 
-#define QLNX_TX_ELEM_RESERVE   2
+#define QLNX_TX_ELEM_RESERVE           2
+#define QLNX_TX_ELEM_THRESH            128
+#define QLNX_TX_ELEM_MAX_THRESH                512
+#define QLNX_TX_ELEM_MIN_THRESH                32
+#define QLNX_TX_COMPL_THRESH           32
 
+
 #define QLNX_TPA_MAX_AGG_BUFFERS             (20)
 
 #define QLNX_MAX_NUM_MULTICAST_ADDRS   ECORE_MAX_MC_ADDRS
@@ -454,6 +478,7 @@ struct qlnx_host {
        qlnx_storm_stats_t      storm_stats[QLNX_STORM_STATS_TOTAL];
        uint32_t                storm_stats_index;
        uint32_t                storm_stats_enable;
+       uint32_t                storm_stats_gather;
 
        uint32_t                personality;
 };
@@ -470,8 +495,11 @@ typedef struct qlnx_host qlnx_host_t;
 
 #define QLNX_MAX_MTU                   9000
 #define QLNX_MAX_SEGMENTS_NON_TSO      (ETH_TX_MAX_BDS_PER_NON_LSO_PACKET - 1)
-#define QLNX_MAX_TSO_FRAME_SIZE                ((64 * 1024 - 1) + 22)
+//#define QLNX_MAX_TSO_FRAME_SIZE              ((64 * 1024 - 1) + 22)
+#define QLNX_MAX_TSO_FRAME_SIZE                65536
+#define QLNX_MAX_TX_MBUF_SIZE          65536    /* bytes - bd_len = 16bits */
 
+
 #define QL_MAC_CMP(mac1, mac2)    \
         ((((*(uint32_t *) mac1) == (*(uint32_t *) mac2) && \
         (*(uint16_t *)(mac1 + 4)) == (*(uint16_t *)(mac2 + 4)))) ? 0 : 1)
@@ -702,6 +730,18 @@ extern void qlnx_fill_link(struct ecore_hwfn *hwfn,
 #define CQE_HAS_VLAN(flags) \
         ((flags) & (PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK \
                 << PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT))
+
+#if defined(__i386__) || defined(__amd64__)
+
+static __inline
+void prefetch(void *x)
+{
+        __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
+}
+
+#else
+#define prefetch(x)
+#endif
 
 
 #endif /* #ifndef _QLNX_DEF_H_ */

Modified: stable/11/sys/dev/qlnx/qlnxe/qlnx_os.c
==============================================================================
--- stable/11/sys/dev/qlnx/qlnxe/qlnx_os.c      Thu Aug 24 18:01:17 2017        (r322850)
+++ stable/11/sys/dev/qlnx/qlnxe/qlnx_os.c      Thu Aug 24 18:51:55 2017        (r322851)
@@ -94,6 +94,8 @@ static int qlnx_get_ifq_snd_maxlen(qlnx_host_t *ha);
 static uint32_t qlnx_get_optics(qlnx_host_t *ha,
                        struct qlnx_link_output *if_link);
 static int qlnx_transmit(struct ifnet *ifp, struct mbuf  *mp);
+static int qlnx_transmit_locked(struct ifnet *ifp, struct qlnx_fastpath *fp,
+               struct mbuf *mp);
 static void qlnx_qflush(struct ifnet *ifp);
 
 static int qlnx_alloc_parent_dma_tag(qlnx_host_t *ha);
@@ -133,6 +135,8 @@ static void qlnx_timer(void *arg);
 static int qlnx_alloc_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp);
 static void qlnx_free_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp);
 static void qlnx_trigger_dump(qlnx_host_t *ha);
+static uint16_t qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp,
+                       struct qlnx_tx_queue *txq);
 static void qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
                struct qlnx_tx_queue *txq);
 static int qlnx_rx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, int budget,
@@ -215,6 +219,12 @@ char qlnx_name_str[NAME_SIZE];
 #define QLOGIC_PCI_DEVICE_ID_8070      0x8070
 #endif
 
+SYSCTL_NODE(_hw, OID_AUTO, qlnxe, CTLFLAG_RD, 0, "qlnxe driver parameters");
+/* Number of Queues: 0 (Auto) or 1 to 32 (fixed queue number) */
+static int qlnxe_queue_count = QLNX_DEFAULT_RSS;
+SYSCTL_INT(_hw_qlnxe, OID_AUTO, queue_count, CTLFLAG_RDTUN,
+               &qlnxe_queue_count, 0, "Multi-Queue queue count");
+
 static int
 qlnx_valid_device(device_t dev)
 {
@@ -302,7 +312,26 @@ qlnx_pci_probe(device_t dev)
         return (BUS_PROBE_DEFAULT);
 }
 
+static uint16_t
+qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp,
+       struct qlnx_tx_queue *txq)
+{
+       u16 hw_bd_cons;
+       u16 ecore_cons_idx;
+       uint16_t diff;
 
+       hw_bd_cons = le16toh(*txq->hw_cons_ptr);
+
+       ecore_cons_idx = ecore_chain_get_cons_idx(&txq->tx_pbl);
+       if (hw_bd_cons < ecore_cons_idx) {
+               diff = (1 << 16) - (ecore_cons_idx - hw_bd_cons);
+       } else {
+               diff = hw_bd_cons - ecore_cons_idx;
+       }
+       return diff;
+}
+
+
 static void
 qlnx_sp_intr(void *arg)
 {
@@ -395,14 +424,11 @@ qlnx_fp_taskqueue(void *context, int pending)
         struct qlnx_fastpath   *fp;
         qlnx_host_t            *ha;
         struct ifnet           *ifp;
-        struct mbuf            *mp;
-        int                    ret;
-       struct thread           *cthread;
 
 #ifdef QLNX_RCV_IN_TASKQ
        int                     lro_enable;
        int                     rx_int = 0, total_rx_count = 0;
-
+       struct thread           *cthread;
 #endif /* #ifdef QLNX_RCV_IN_TASKQ */
 
         fp = context;
@@ -410,6 +436,12 @@ qlnx_fp_taskqueue(void *context, int pending)
         if (fp == NULL)
                 return;
 
+        ha = (qlnx_host_t *)fp->edev;
+
+        ifp = ha->ifp;
+
+#ifdef QLNX_RCV_IN_TASKQ
+
        cthread = curthread;
 
        thread_lock(cthread);
@@ -419,112 +451,81 @@ qlnx_fp_taskqueue(void *context, int pending)
 
        thread_unlock(cthread);
 
-        ha = (qlnx_host_t *)fp->edev;
+       lro_enable = ifp->if_capenable & IFCAP_LRO;
 
-        ifp = ha->ifp;
+       rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable);
 
-#ifdef QLNX_RCV_IN_TASKQ
-       {
-               lro_enable = ifp->if_capenable & IFCAP_LRO;
+       if (rx_int) {
+               fp->rx_pkts += rx_int;
+               total_rx_count += rx_int;
+       }
 
-               rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable);
-
-               if (rx_int) {
-                       fp->rx_pkts += rx_int;
-                       total_rx_count += rx_int;
-               }
-
 #ifdef QLNX_SOFT_LRO
-               {
-                       struct lro_ctrl *lro;
-       
-                       lro = &fp->rxq->lro;
+       {
+               struct lro_ctrl *lro;
 
-                       if (lro_enable && total_rx_count) {
+               lro = &fp->rxq->lro;
 
+               if (lro_enable && total_rx_count) {
+
 #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO)
 
-                               if (ha->dbg_trace_lro_cnt) {
-                                       if (lro->lro_mbuf_count & ~1023)
-                                               fp->lro_cnt_1024++;
-                                       else if (lro->lro_mbuf_count & ~511)
-                                               fp->lro_cnt_512++;
-                                       else if (lro->lro_mbuf_count & ~255)
-                                               fp->lro_cnt_256++;
-                                       else if (lro->lro_mbuf_count & ~127)
-                                               fp->lro_cnt_128++;
-                                       else if (lro->lro_mbuf_count & ~63)
-                                               fp->lro_cnt_64++;
-                               }
-                               tcp_lro_flush_all(lro);
+                       if (ha->dbg_trace_lro_cnt) {
+                               if (lro->lro_mbuf_count & ~1023)
+                                       fp->lro_cnt_1024++;
+                               else if (lro->lro_mbuf_count & ~511)
+                                       fp->lro_cnt_512++;
+                               else if (lro->lro_mbuf_count & ~255)
+                                       fp->lro_cnt_256++;
+                               else if (lro->lro_mbuf_count & ~127)
+                                       fp->lro_cnt_128++;
+                               else if (lro->lro_mbuf_count & ~63)
+                                       fp->lro_cnt_64++;
+                       }
+                       tcp_lro_flush_all(lro);
 
 #else
-                               struct lro_entry *queued;
+                       struct lro_entry *queued;
 
-                               while ((!SLIST_EMPTY(&lro->lro_active))) {
-                                       queued = SLIST_FIRST(&lro->lro_active);
-                                       SLIST_REMOVE_HEAD(&lro->lro_active, next);
-                                       tcp_lro_flush(lro, queued);
-                               }
-#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */
+                       while ((!SLIST_EMPTY(&lro->lro_active))) {
+                               queued = SLIST_FIRST(&lro->lro_active);
+                               SLIST_REMOVE_HEAD(&lro->lro_active, next);
+                               tcp_lro_flush(lro, queued);
                        }
+#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */
                }
+       }
 #endif /* #ifdef QLNX_SOFT_LRO */
 
-               ecore_sb_update_sb_idx(fp->sb_info);
-               rmb();
-       }
+       ecore_sb_update_sb_idx(fp->sb_info);
+       rmb();
 
 #endif /* #ifdef QLNX_RCV_IN_TASKQ */
 
-        mtx_lock(&fp->tx_mtx);
+        if(ifp->if_drv_flags & IFF_DRV_RUNNING) {
 
-        if (((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
-                IFF_DRV_RUNNING) || (!ha->link_up)) {
+                if (!drbr_empty(ifp, fp->tx_br)) {
 
-                mtx_unlock(&fp->tx_mtx);
-                goto qlnx_fp_taskqueue_exit;
-        }
+                        if(mtx_trylock(&fp->tx_mtx)) {
 
-        mp = drbr_peek(ifp, fp->tx_br);
+#ifdef QLNX_TRACE_PERF_DATA
+                                tx_pkts = fp->tx_pkts_transmitted;
+                                tx_compl = fp->tx_pkts_completed;
+#endif
 
-        while (mp != NULL) {
+                                qlnx_transmit_locked(ifp, fp, NULL);
 
-               if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-                       ret = qlnx_send(ha, fp, &mp);
-               } else {
-                       ret = -1;
-               }
-
-                if (ret) {
-
-                        if (mp != NULL) {
-                                drbr_putback(ifp, fp->tx_br, mp);
-                        } else {
-                                fp->tx_pkts_processed++;
-                                drbr_advance(ifp, fp->tx_br);
+#ifdef QLNX_TRACE_PERF_DATA
+                                fp->tx_pkts_trans_fp +=
+                                       (fp->tx_pkts_transmitted - tx_pkts);
+                                fp->tx_pkts_compl_fp +=
+                                       (fp->tx_pkts_completed - tx_compl);
+#endif
+                                mtx_unlock(&fp->tx_mtx);
                         }
-
-                        mtx_unlock(&fp->tx_mtx);
-
-                        goto qlnx_fp_taskqueue_exit;
-
-                } else {
-                        drbr_advance(ifp, fp->tx_br);
-                        fp->tx_pkts_transmitted++;
-                        fp->tx_pkts_processed++;
                 }
-
-               if (fp->tx_ring_full)
-                       break;
-
-                mp = drbr_peek(ifp, fp->tx_br);
         }
 
-        mtx_unlock(&fp->tx_mtx);
-
-qlnx_fp_taskqueue_exit:
-
 #ifdef QLNX_RCV_IN_TASKQ
        if (rx_int) {
                if (fp->fp_taskqueue != NULL)
@@ -537,7 +538,7 @@ qlnx_fp_taskqueue_exit:
        }
 #endif /* #ifdef QLNX_RCV_IN_TASKQ */
 
-        QL_DPRINT2(ha, "exit ret = %d\n", ret);
+        QL_DPRINT2(ha, "exit \n");
         return;
 }
 
@@ -611,6 +612,17 @@ qlnx_drain_fp_taskqueues(qlnx_host_t *ha)
        return;
 }
 
+static void
+qlnx_get_params(qlnx_host_t *ha)
+{
+       if ((qlnxe_queue_count < 0) || (qlnxe_queue_count > QLNX_MAX_RSS)) {
+               device_printf(ha->pci_dev, "invalid queue_count value (%d)\n",
+                       qlnxe_queue_count);
+               qlnxe_queue_count = 0;
+       }
+       return;
+}
+
 /*
  * Name:       qlnx_pci_attach
  * Function:   attaches the device to the operating system
@@ -706,10 +718,21 @@ qlnx_pci_attach(device_t dev)
        if (qlnx_init_hw(ha) != 0)
                goto qlnx_pci_attach_err;
 
+       qlnx_get_params(ha);
+
+       if((pci_get_device(dev) == QLOGIC_PCI_DEVICE_ID_1644) &&
+               (qlnxe_queue_count == QLNX_DEFAULT_RSS)) {
+               qlnxe_queue_count = QLNX_MAX_RSS;
+       }
+
        /*
         * Allocate MSI-x vectors
         */
-       ha->num_rss = QLNX_MAX_RSS;
+       if(qlnxe_queue_count == 0)
+               ha->num_rss = QLNX_DEFAULT_RSS;
+        else
+               ha->num_rss = qlnxe_queue_count;
+
        ha->num_tc = QLNX_MAX_TC;
 
         ha->msix_count = pci_msix_count(dev);
@@ -1236,6 +1259,44 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha)
                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_completed,
                        "No. of transmit completions");
 
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_non_tso_pkts",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_non_tso_pkts,
+                        "No. of non LSO transmited packets");
+
+#ifdef QLNX_TRACE_PERF_DATA
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_trans_ctx",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_ctx,
+                        "No. of transmitted packets in transmit context");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_compl_ctx",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_ctx,
+                        "No. of transmit completions in transmit context");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_trans_fp",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_fp,
+                        "No. of transmitted packets in taskqueue");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_compl_fp",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_fp,
+                        "No. of transmit completions in taskqueue");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_compl_intr",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_intr,
+                        "No. of transmit completions in interrupt ctx");
+#endif
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_tso_pkts",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_tso_pkts,
+                        "No. of LSO transmited packets");
+
                SYSCTL_ADD_QUAD(ctx, node_children,
                        OID_AUTO, "tx_lso_wnd_min_len",
                        CTLFLAG_RD, &ha->fp_array[i].tx_lso_wnd_min_len,
@@ -1284,6 +1345,39 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha)
                                &ha->fp_array[i].tx_pkts[j], name_str);
                }
 
+#ifdef QLNX_TRACE_PERF_DATA
+                for (j = 0; j < 18; j++) {
+
+                        bzero(name_str, (sizeof(uint8_t) * sizeof(name_str)));
+                        snprintf(name_str, sizeof(name_str),
+                                "tx_pkts_hist_%02d", (j+1));
+
+                        SYSCTL_ADD_QUAD(ctx, node_children,
+                                OID_AUTO, name_str, CTLFLAG_RD,
+                                &ha->fp_array[i].tx_pkts_hist[j], name_str);
+                }
+                for (j = 0; j < 5; j++) {
+
+                        bzero(name_str, (sizeof(uint8_t) * sizeof(name_str)));
+                        snprintf(name_str, sizeof(name_str),
+                                "tx_comInt_%02d", (j+1));
+
+                        SYSCTL_ADD_QUAD(ctx, node_children,
+                                OID_AUTO, name_str, CTLFLAG_RD,
+                                &ha->fp_array[i].tx_comInt[j], name_str);
+                }
+                for (j = 0; j < 18; j++) {
+
+                        bzero(name_str, (sizeof(uint8_t) * sizeof(name_str)));
+                        snprintf(name_str, sizeof(name_str),
+                                "tx_pkts_q_%02d", (j+1));
+
+                        SYSCTL_ADD_QUAD(ctx, node_children,
+                                OID_AUTO, name_str, CTLFLAG_RD,
+                                &ha->fp_array[i].tx_pkts_q[j], name_str);
+                }
+#endif
+
                SYSCTL_ADD_QUAD(ctx, node_children,
                        OID_AUTO, "err_tx_nsegs_gt_elem_left",
                        CTLFLAG_RD, &ha->fp_array[i].err_tx_nsegs_gt_elem_left,
@@ -1979,6 +2073,12 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
        ifp->if_capabilities |= IFCAP_TSO6;
        ifp->if_capabilities |= IFCAP_LRO;
 
+       ifp->if_hw_tsomax =  QLNX_MAX_TSO_FRAME_SIZE -
+                               (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+       ifp->if_hw_tsomaxsegcount = QLNX_MAX_SEGMENTS - 1 /* hdr */;
+       ifp->if_hw_tsomaxsegsize = QLNX_MAX_TX_MBUF_SIZE;
+
+
         ifp->if_capenable = ifp->if_capabilities;
 
        ifp->if_hwassist = CSUM_IP;
@@ -2543,6 +2643,7 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
        u16 hw_bd_cons;
        u16 ecore_cons_idx;
        uint16_t diff;
+       uint16_t idx, idx2;
 
        hw_bd_cons = le16toh(*txq->hw_cons_ptr);
 
@@ -2580,6 +2681,11 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
                        qlnx_trigger_dump(ha);
                }
 
+               idx = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1);
+               idx2 = (txq->sw_tx_cons + 2) & (TX_RING_SIZE - 1);
+               prefetch(txq->sw_tx_ring[idx].mp);
+               prefetch(txq->sw_tx_ring[idx2].mp);
+
                qlnx_free_tx_pkt(ha, fp, txq);
 
                txq->sw_tx_cons = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1);
@@ -2588,12 +2694,71 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
 }
 
 static int
+qlnx_transmit_locked(struct ifnet *ifp,struct qlnx_fastpath  *fp, struct mbuf *mp)
+{
+        int                     ret = 0;
+        struct qlnx_tx_queue    *txq;
+        qlnx_host_t *           ha;
+        uint16_t elem_left;
+
+        txq = fp->txq[0];
+        ha = (qlnx_host_t *)fp->edev;
+
+
+        if ((!(ifp->if_drv_flags & IFF_DRV_RUNNING)) || (!ha->link_up)) {
+                if(mp != NULL)
+                        ret = drbr_enqueue(ifp, fp->tx_br, mp);
+                return (ret);
+        }
+
+        if(mp != NULL)
+                ret  = drbr_enqueue(ifp, fp->tx_br, mp);
+
+        mp = drbr_peek(ifp, fp->tx_br);
+
+        while (mp != NULL) {
+
+                if (qlnx_send(ha, fp, &mp)) {
+
+                        if (mp != NULL) {
+                                drbr_putback(ifp, fp->tx_br, mp);
+                        } else {
+                                fp->tx_pkts_processed++;
+                                drbr_advance(ifp, fp->tx_br);
+                        }
+                        goto qlnx_transmit_locked_exit;
+
+                } else {
+                        drbr_advance(ifp, fp->tx_br);
+                        fp->tx_pkts_transmitted++;
+                        fp->tx_pkts_processed++;
+                }
+
+                mp = drbr_peek(ifp, fp->tx_br);
+        }
+
+qlnx_transmit_locked_exit:
+        if((qlnx_num_tx_compl(ha,fp, fp->txq[0]) > QLNX_TX_COMPL_THRESH) ||
+                ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))
+                                        < QLNX_TX_ELEM_MAX_THRESH))
+                (void)qlnx_tx_int(ha, fp, fp->txq[0]);
+
+        QL_DPRINT2(ha, "%s: exit ret = %d\n", __func__, ret);
+        return ret;
+}
+
+
+static int
 qlnx_transmit(struct ifnet *ifp, struct mbuf  *mp)
 {
         qlnx_host_t            *ha = (qlnx_host_t *)ifp->if_softc;
         struct qlnx_fastpath   *fp;
         int                    rss_id = 0, ret = 0;
 
+#ifdef QLNX_TRACEPERF_DATA
+        uint64_t tx_pkts = 0, tx_compl = 0;
+#endif
+
         QL_DPRINT2(ha, "enter\n");
 
 #if __FreeBSD_version >= 1100000
@@ -2611,15 +2776,27 @@ qlnx_transmit(struct ifnet *ifp, struct mbuf  *mp)
                 goto qlnx_transmit_exit;
         }
 
-        if (mp != NULL) {
-                ret = drbr_enqueue(ifp, fp->tx_br, mp);
-        }
+        if (mtx_trylock(&fp->tx_mtx)) {
 
-        if (fp->fp_taskqueue != NULL)
-                taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
+#ifdef QLNX_TRACEPERF_DATA
+                        tx_pkts = fp->tx_pkts_transmitted;
+                        tx_compl = fp->tx_pkts_completed;
+#endif
 
-        ret = 0;
+                        ret = qlnx_transmit_locked(ifp, fp, mp);
 
+#ifdef QLNX_TRACEPERF_DATA
+                        fp->tx_pkts_trans_ctx += (fp->tx_pkts_transmitted - tx_pkts);
+                        fp->tx_pkts_compl_ctx += (fp->tx_pkts_completed - tx_compl);
+#endif
+                        mtx_unlock(&fp->tx_mtx);
+        } else {
+                if (mp != NULL && (fp->fp_taskqueue != NULL)) {
+                        ret = drbr_enqueue(ifp, fp->tx_br, mp);
+                        taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
+                }
+        }
+
 qlnx_transmit_exit:
 
         QL_DPRINT2(ha, "exit ret = %d\n", ret);
@@ -2799,6 +2976,10 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
        uint32_t                nbds_in_hdr = 0;
        uint32_t                offset = 0;
 
+#ifdef QLNX_TRACE_PERF_DATA
+        uint16_t                bd_used;
+#endif
+
        QL_DPRINT8(ha, "enter\n");
 
        if (!ha->link_up)
@@ -2811,15 +2992,15 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 
        txq = fp->txq[0];
 
-       if (fp->tx_ring_full) {
-               elem_left = ecore_chain_get_elem_left(&txq->tx_pbl);
+        if ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) <
+               QLNX_TX_ELEM_MIN_THRESH) {
 
-               if (elem_left < (TX_RING_SIZE >> 4)) 
-                       return (-1);
-               else 
-                       fp->tx_ring_full = 0;
-       }
+                fp->tx_nsegs_gt_elem_left++;
+                fp->err_tx_nsegs_gt_elem_left++;
 
+                return (ENOBUFS);
+        }
+
        idx = txq->sw_tx_prod;
 
        map = txq->sw_tx_ring[idx].map;
@@ -2829,14 +3010,18 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
                        BUS_DMA_NOWAIT);
 
        if (ha->dbg_trace_tso_pkt_len) {
-               if (!fp->tx_tso_min_pkt_len) {
-                       fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
-                       fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
-               } else {
-                       if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len)
+               if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+                       if (!fp->tx_tso_min_pkt_len) {
                                fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
-                       if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len)
-                               fp->tx_tso_max_pkt_len = m_head->m_pkthdr.len;
+                               fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
+                       } else {
+                               if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len)
+                                       fp->tx_tso_min_pkt_len =
+                                               m_head->m_pkthdr.len;
+                               if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len)
+                                       fp->tx_tso_max_pkt_len =
+                                               m_head->m_pkthdr.len;
+                       }
                }
        }
 
@@ -2923,6 +3108,105 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
                        fp->tx_pkts[(QLNX_FP_MAX_SEGS - 1)]++; 
        }
 
+#ifdef QLNX_TRACE_PERF_DATA
+        if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+                if(m_head->m_pkthdr.len <= 2048)
+                        fp->tx_pkts_hist[0]++;
+                else if((m_head->m_pkthdr.len > 2048) &&
+                               (m_head->m_pkthdr.len <= 4096))
+                        fp->tx_pkts_hist[1]++;
+                else if((m_head->m_pkthdr.len > 4096) &&
+                               (m_head->m_pkthdr.len <= 8192))
+                        fp->tx_pkts_hist[2]++;
+                else if((m_head->m_pkthdr.len > 8192) &&
+                               (m_head->m_pkthdr.len <= 12288 ))
+                        fp->tx_pkts_hist[3]++;
+                else if((m_head->m_pkthdr.len > 11288) &&
+                               (m_head->m_pkthdr.len <= 16394))
+                        fp->tx_pkts_hist[4]++;
+                else if((m_head->m_pkthdr.len > 16384) &&
+                               (m_head->m_pkthdr.len <= 20480))
+                        fp->tx_pkts_hist[5]++;
+                else if((m_head->m_pkthdr.len > 20480) &&
+                               (m_head->m_pkthdr.len <= 24576))
+                        fp->tx_pkts_hist[6]++;
+                else if((m_head->m_pkthdr.len > 24576) &&
+                               (m_head->m_pkthdr.len <= 28672))
+                        fp->tx_pkts_hist[7]++;
+                else if((m_head->m_pkthdr.len > 28762) &&
+                               (m_head->m_pkthdr.len <= 32768))
+                        fp->tx_pkts_hist[8]++;
+                else if((m_head->m_pkthdr.len > 32768) &&
+                               (m_head->m_pkthdr.len <= 36864))
+                        fp->tx_pkts_hist[9]++;
+                else if((m_head->m_pkthdr.len > 36864) &&
+                               (m_head->m_pkthdr.len <= 40960))
+                        fp->tx_pkts_hist[10]++;
+                else if((m_head->m_pkthdr.len > 40960) &&
+                               (m_head->m_pkthdr.len <= 45056))
+                        fp->tx_pkts_hist[11]++;
+                else if((m_head->m_pkthdr.len > 45056) &&
+                               (m_head->m_pkthdr.len <= 49152))
+                        fp->tx_pkts_hist[12]++;
+                else if((m_head->m_pkthdr.len > 49512) &&
+                               (m_head->m_pkthdr.len <= 53248))
+                        fp->tx_pkts_hist[13]++;
+                else if((m_head->m_pkthdr.len > 53248) &&
+                               (m_head->m_pkthdr.len <= 57344))
+                        fp->tx_pkts_hist[14]++;
+                else if((m_head->m_pkthdr.len > 53248) &&
+                               (m_head->m_pkthdr.len <= 57344))
+                        fp->tx_pkts_hist[15]++;
+                else if((m_head->m_pkthdr.len > 57344) &&
+                               (m_head->m_pkthdr.len <= 61440))
+                        fp->tx_pkts_hist[16]++;
+                else
+                        fp->tx_pkts_hist[17]++;
+        }
+
+        if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+
+                elem_left =  ecore_chain_get_elem_left(&txq->tx_pbl);
+                bd_used = TX_RING_SIZE - elem_left;
+
+                if(bd_used <= 100)
+                        fp->tx_pkts_q[0]++;
+                else if((bd_used > 100) && (bd_used <= 500))
+                        fp->tx_pkts_q[1]++;
+                else if((bd_used > 500) && (bd_used <= 1000))
+                        fp->tx_pkts_q[2]++;
+                else if((bd_used > 1000) && (bd_used <= 2000))
+                        fp->tx_pkts_q[3]++;
+                else if((bd_used > 3000) && (bd_used <= 4000))
+                        fp->tx_pkts_q[4]++;
+                else if((bd_used > 4000) && (bd_used <= 5000))
+                        fp->tx_pkts_q[5]++;
+                else if((bd_used > 6000) && (bd_used <= 7000))
+                        fp->tx_pkts_q[6]++;
+                else if((bd_used > 7000) && (bd_used <= 8000))
+                        fp->tx_pkts_q[7]++;
+                else if((bd_used > 8000) && (bd_used <= 9000))
+                        fp->tx_pkts_q[8]++;
+                else if((bd_used > 9000) && (bd_used <= 10000))
+                        fp->tx_pkts_q[9]++;
+                else if((bd_used > 10000) && (bd_used <= 11000))
+                        fp->tx_pkts_q[10]++;
+                else if((bd_used > 11000) && (bd_used <= 12000))
+                        fp->tx_pkts_q[11]++;
+                else if((bd_used > 12000) && (bd_used <= 13000))
+                        fp->tx_pkts_q[12]++;
+                else if((bd_used > 13000) && (bd_used <= 14000))
+                        fp->tx_pkts_q[13]++;
+                else if((bd_used > 14000) && (bd_used <= 15000))
+                        fp->tx_pkts_q[14]++;
+               else if((bd_used > 15000) && (bd_used <= 16000))
+                        fp->tx_pkts_q[15]++;
+                else
+                        fp->tx_pkts_q[16]++;
+        }
+
+#endif /* end of QLNX_TRACE_PERF_DATA */
+
        if ((nsegs + QLNX_TX_ELEM_RESERVE) >
                (int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))) {
 
@@ -2943,7 +3227,8 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 
                        fp->err_tx_nsegs_gt_elem_left++;
                        fp->tx_ring_full = 1;
-                       ha->storm_stats_enable = 1;
+                       if (ha->storm_stats_enable)
+                               ha->storm_stats_gather = 1;
                        return (ENOBUFS);
                }
        }
@@ -3131,6 +3416,7 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
                        third_bd->data.bitfields |=
                                (nbds_in_hdr<<ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT);
                }
+               fp->tx_tso_pkts++;
        } else {
                segs++;
                for (seg_idx = 1; seg_idx < nsegs; seg_idx++) {
@@ -3147,6 +3433,7 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
                                 << ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT;
                first_bd->data.bitfields =
                        htole16(first_bd->data.bitfields);
+               fp->tx_non_tso_pkts++;
        }
 
 
@@ -4303,8 +4590,10 @@ qlnx_fp_isr(void *arg)
                if (fp->fp_taskqueue != NULL)
                        taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
 #else
-               int     rx_int = 0, total_rx_count = 0;
-               int     lro_enable, tc;
+               int                     rx_int = 0, total_rx_count = 0;
+               int                     lro_enable, tc;
+               struct qlnx_tx_queue    *txq;
+               uint16_t                elem_left;
 
                lro_enable = ha->ifp->if_capenable & IFCAP_LRO;
 
@@ -4312,10 +4601,36 @@ qlnx_fp_isr(void *arg)
 
                 do {
                         for (tc = 0; tc < ha->num_tc; tc++) {
-                                if (mtx_trylock(&fp->tx_mtx)) {
-                                        qlnx_tx_int(ha, fp, fp->txq[tc]);
-                                        mtx_unlock(&fp->tx_mtx);
-                                }
+
+                               txq = fp->txq[tc];
+
+                               if((int)(elem_left =
+                                       ecore_chain_get_elem_left(&txq->tx_pbl)) <
+                                               QLNX_TX_ELEM_THRESH)  {
+
+                                       if (mtx_trylock(&fp->tx_mtx)) {
+#ifdef QLNX_TRACE_PERF_DATA
+                                               tx_compl = fp->tx_pkts_completed;
+#endif
+
+                                               qlnx_tx_int(ha, fp, fp->txq[tc]);
+#ifdef QLNX_TRACE_PERF_DATA
+                                               fp->tx_pkts_compl_intr +=
+                                                       (fp->tx_pkts_completed - tx_compl);
+                                               if ((fp->tx_pkts_completed - tx_compl) <= 32)
+                                                       fp->tx_comInt[0]++;
+                                               else if (((fp->tx_pkts_completed - tx_compl) > 32) &&
+                                                       ((fp->tx_pkts_completed - tx_compl) <= 64))
+                                                       fp->tx_comInt[1]++;
+                                               else if(((fp->tx_pkts_completed - tx_compl) > 64) &&
+                                                       ((fp->tx_pkts_completed - tx_compl) <= 128))
+                                                       fp->tx_comInt[2]++;
+                                               else if(((fp->tx_pkts_completed - tx_compl) > 128))
+                                                       fp->tx_comInt[3]++;
+#endif
+                                               mtx_unlock(&fp->tx_mtx);
+                                       }
+                               }
                         }
 
                         rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold,
@@ -4328,7 +4643,6 @@ qlnx_fp_isr(void *arg)
 
                 } while (rx_int);
 
-
 #ifdef QLNX_SOFT_LRO
                 {
                         struct lro_ctrl *lro;
@@ -4608,8 +4922,8 @@ qlnx_alloc_tx_dma_tag(qlnx_host_t *ha)
                 NULL, NULL,      /* filter, filterarg */
                 QLNX_MAX_TSO_FRAME_SIZE,     /* maxsize */
                 QLNX_MAX_SEGMENTS,        /* nsegments */
-                (PAGE_SIZE * 4),        /* maxsegsize */
-                BUS_DMA_ALLOCNOW,        /* flags */
+                QLNX_MAX_TX_MBUF_SIZE,   /* maxsegsize */
+                0,        /* flags */
                 NULL,    /* lockfunc */
                 NULL,    /* lockfuncarg */
                 &ha->tx_tag)) {
@@ -4642,7 +4956,7 @@ qlnx_alloc_rx_dma_tag(qlnx_host_t *ha)
                         MJUM9BYTES,     /* maxsize */
                         1,        /* nsegments */
                         MJUM9BYTES,        /* maxsegsize */
-                        BUS_DMA_ALLOCNOW,        /* flags */
+                        0,        /* flags */
                         NULL,    /* lockfunc */
                         NULL,    /* lockfuncarg */
                         &ha->rx_tag)) {
@@ -5255,6 +5569,14 @@ qlnx_init_fp(qlnx_host_t *ha)
                fp->tx_pkts_freed = 0;
                fp->tx_pkts_transmitted = 0;
                fp->tx_pkts_completed = 0;
+
+#ifdef QLNX_TRACE_PERF_DATA
+               fp->tx_pkts_trans_ctx = 0;
+               fp->tx_pkts_compl_ctx = 0;
+               fp->tx_pkts_trans_fp = 0;
+               fp->tx_pkts_compl_fp = 0;
+               fp->tx_pkts_compl_intr = 0;
+#endif
                fp->tx_lso_wnd_min_len = 0;
                fp->tx_defrag = 0;
                fp->tx_nsegs_gt_elem_left = 0;
@@ -6606,7 +6928,7 @@ qlnx_timer(void *arg)
 
                ecore_get_vport_stats(&ha->cdev, &ha->hw_stats);
 
-       if (ha->storm_stats_enable)
+       if (ha->storm_stats_gather)
                qlnx_sample_storm_stats(ha);
 
        callout_reset(&ha->qlnx_callout, hz, qlnx_timer, ha);
@@ -6855,7 +7177,7 @@ qlnx_sample_storm_stats(qlnx_host_t *ha)
         struct ecore_hwfn      *hwfn;
 
        if (ha->storm_stats_index >= QLNX_STORM_STATS_SAMPLES_PER_HWFN) {
-               ha->storm_stats_enable = 0;
+               ha->storm_stats_gather = 0;
                return;
        }
 

Modified: stable/11/sys/dev/qlnx/qlnxe/qlnx_ver.h
==============================================================================
--- stable/11/sys/dev/qlnx/qlnxe/qlnx_ver.h     Thu Aug 24 18:01:17 2017        (r322850)
+++ stable/11/sys/dev/qlnx/qlnxe/qlnx_ver.h     Thu Aug 24 18:51:55 2017        (r322851)
@@ -39,5 +39,5 @@
 
 #define QLNX_VERSION_MAJOR      1
 #define QLNX_VERSION_MINOR      4
-#define QLNX_VERSION_BUILD      6
+#define QLNX_VERSION_BUILD      7
 

Modified: stable/11/sys/modules/qlnx/qlnxe/Makefile
==============================================================================
--- stable/11/sys/modules/qlnx/qlnxe/Makefile   Thu Aug 24 18:01:17 2017        (r322850)
+++ stable/11/sys/modules/qlnx/qlnxe/Makefile   Thu Aug 24 18:51:55 2017        (r322851)
@@ -52,7 +52,7 @@ SRCS+= pci_if.h
 
 CWARNEXTRA += -Wno-cast-qual
 
-CFLAGS += -DQLNX_DEBUG
+#CFLAGS += -DQLNX_DEBUG
 CFLAGS += -DECORE_PACKAGE
 CFLAGS += -DCONFIG_ECORE_L2
 CFLAGS += -DECORE_CONFIG_DIRECT_HWFN