mpi's recent post about vlan performance, which happened to use ix,
reminded me that i had some diffs for ix that may be relevant to
the discussion.

this uses the loads and stores of the producer and consumer indexes
to calculate free space in the start path and to figure out how much
space to reclaim in txeof. this is instead of coordinating via an
actual "available slots" counter updated with atomic ops.

it also avoids doing a bunch of work in the txeof path. instead of
putting the mbuf on the last descriptor for a packet, we leave it on the
first and use the index of the last one to poll for packet completion.
this means we can leave the mbuf and dmamap in the first slot, which
simplifies the encap code. by only reading the last descriptor from the
ring, we are no longer scrubbing entries like we used to, but this
doesn't seem to be a problem in practice.
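
the completion check in the new txeof loop ends up being just this
(names as in the txeof hunk below):

        /*
         * the first slot of each packet keeps the mbuf and dmamap, and
         * records the index of the packet's last descriptor.
         */
        tx_buffer = &txr->tx_buffers[tail];
        last = tx_buffer->eop_index;
        tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last];

        /* only the last descriptor is checked for completion */
        if (!ISSET(tx_desc->upper.fields.status, IXGBE_TXD_STAT_DD))
                break;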

anyway, i think the big cost when transmitting is writing to the tdt
register at the end of the loop. we effectively call start for every
packet, which means we post each packet to the chip with its own
register write. this is why tx mitigation helps; it reduces the number
of register writes.
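
for illustration, here's roughly the shape of the start function. it
is simplified from the existing code (the tdt write at the end is the
one the driver already does), so treat it as a sketch:

        for (;;) {
                m_head = ifq_dequeue(ifq);
                if (m_head == NULL)
                        break;

                /* map the packet and fill descriptors on the ring */
                if (ixgbe_encap(txr, m_head) == 0) {
                        m_freem(m_head);
                        continue;
                }

                post = 1;
        }

        /*
         * a single doorbell write tells the chip about everything queued
         * in this pass. if start only ever sees one packet per call,
         * this write happens once per packet, which is the expensive bit.
         */
        if (post)
                IXGBE_WRITE_REG(&sc->hw, IXGBE_TDT(txr->me),
                    txr->next_avail_desc);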

dlg

Index: if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.152
diff -u -p -r1.152 if_ix.c
--- if_ix.c     22 Jun 2017 02:44:37 -0000      1.152
+++ if_ix.c     19 Feb 2019 11:36:43 -0000
@@ -385,6 +385,7 @@ ixgbe_start(struct ifqueue *ifq)
        struct ix_softc         *sc = ifp->if_softc;
        struct tx_ring          *txr = sc->tx_rings;
        struct mbuf             *m_head;
+       unsigned int             head, free, used;
        int                      post = 0;
 
        if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(ifq))
@@ -392,13 +393,21 @@ ixgbe_start(struct ifqueue *ifq)
        if (!sc->link_up)
                return;
 
-       bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 0,
-           txr->txdma.dma_map->dm_mapsize,
-           BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+       head = txr->next_avail_desc;
+       free = txr->next_to_clean;
+       if (free <= head)
+               free += sc->num_tx_desc;
+       free -= head;
+
+       membar_consumer();
+
+       bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+           0, txr->txdma.dma_map->dm_mapsize,
+           BUS_DMASYNC_POSTWRITE);
 
        for (;;) {
                /* Check that we have the minimal number of TX descriptors. */
-               if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD) {
+               if (free <= IXGBE_TX_OP_THRESHOLD) {
                        ifq_set_oactive(ifq);
                        break;
                }
@@ -407,11 +416,14 @@ ixgbe_start(struct ifqueue *ifq)
                if (m_head == NULL)
                        break;
 
-               if (ixgbe_encap(txr, m_head)) {
+               used = ixgbe_encap(txr, m_head);
+               if (used == 0) {
                        m_freem(m_head);
                        continue;
                }
 
+               free -= used;
+
 #if NBPFILTER > 0
                if (ifp->if_bpf)
                        bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT);
@@ -426,7 +438,7 @@ ixgbe_start(struct ifqueue *ifq)
 
        bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
            0, txr->txdma.dma_map->dm_mapsize,
-           BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+           BUS_DMASYNC_PREWRITE);
 
        /*
         * Advance the Transmit Descriptor Tail (Tdt), this tells the
@@ -579,8 +591,8 @@ ixgbe_watchdog(struct ifnet * ifp)
                printf("%s: Queue(%d) tdh = %d, hw tdt = %d\n", ifp->if_xname, 
i,
                    IXGBE_READ_REG(hw, IXGBE_TDH(i)),
                    IXGBE_READ_REG(hw, IXGBE_TDT(i)));
-               printf("%s: TX(%d) desc avail = %d, Next TX to Clean = %d\n", 
ifp->if_xname,
-                   i, txr->tx_avail, txr->next_to_clean);
+               printf("%s: TX(%d) Next TX to Clean = %d\n", ifp->if_xname,
+                   i, txr->next_to_clean);
        }
        ifp->if_flags &= ~IFF_RUNNING;
        sc->watchdog_events++;
@@ -1150,7 +1162,7 @@ ixgbe_encap(struct tx_ring *txr, struct 
 {
        struct ix_softc *sc = txr->sc;
        uint32_t        olinfo_status = 0, cmd_type_len;
-       int             i, j, error;
+       int             i, j, ntxc;
        int             first, last = 0;
        bus_dmamap_t    map;
        struct ixgbe_tx_buf *txbuf;
@@ -1177,36 +1189,34 @@ ixgbe_encap(struct tx_ring *txr, struct 
        /*
         * Map the packet for DMA.
         */
-       error = bus_dmamap_load_mbuf(txr->txdma.dma_tag, map, m_head,
-           BUS_DMA_NOWAIT);
-       switch (error) {
+       switch (bus_dmamap_load_mbuf(txr->txdma.dma_tag, map,
+           m_head, BUS_DMA_NOWAIT)) {
        case 0:
                break;
        case EFBIG:
                if (m_defrag(m_head, M_NOWAIT) == 0 &&
-                   (error = bus_dmamap_load_mbuf(txr->txdma.dma_tag, map,
-                    m_head, BUS_DMA_NOWAIT)) == 0)
+                   bus_dmamap_load_mbuf(txr->txdma.dma_tag, map,
+                    m_head, BUS_DMA_NOWAIT) == 0)
                        break;
                /* FALLTHROUGH */
        default:
                sc->no_tx_dma_setup++;
-               return (error);
+               return (0);
        }
 
-       /* Make certain there are enough descriptors */
-       KASSERT(map->dm_nsegs <= txr->tx_avail - 2);
-
        /*
         * Set the appropriate offload context
         * this will becomes the first descriptor.
         */
-       error = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status);
-       if (error)
+       ntxc = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status);
+       if (ntxc == -1)
                goto xmit_fail;
 
-       i = txr->next_avail_desc;
+       i = txr->next_avail_desc + ntxc;
+       if (i >= sc->num_tx_desc)
+               i -= sc->num_tx_desc;
+
        for (j = 0; j < map->dm_nsegs; j++) {
-               txbuf = &txr->tx_buffers[i];
                txd = &txr->tx_base[i];
 
                txd->read.buffer_addr = htole64(map->dm_segs[j].ds_addr);
@@ -1217,41 +1227,28 @@ ixgbe_encap(struct tx_ring *txr, struct 
 
                if (++i == sc->num_tx_desc)
                        i = 0;
-
-               txbuf->m_head = NULL;
-               txbuf->eop_index = -1;
        }
 
        txd->read.cmd_type_len |=
            htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS);
 
-       txbuf->m_head = m_head;
-       /*
-        * Here we swap the map so the last descriptor,
-        * which gets the completion interrupt has the
-        * real map, and the first descriptor gets the
-        * unused map from this descriptor.
-        */
-       txr->tx_buffers[first].map = txbuf->map;
-       txbuf->map = map;
        bus_dmamap_sync(txr->txdma.dma_tag, map, 0, map->dm_mapsize,
            BUS_DMASYNC_PREWRITE);
 
        /* Set the index of the descriptor that will be marked done */
-       txbuf = &txr->tx_buffers[first];
+       txbuf->m_head = m_head;
        txbuf->eop_index = last;
 
        membar_producer();
 
-       atomic_sub_int(&txr->tx_avail, map->dm_nsegs);
        txr->next_avail_desc = i;
 
        ++txr->tx_packets;
-       return (0);
+       return (ntxc + j);
 
 xmit_fail:
        bus_dmamap_unload(txr->txdma.dma_tag, txbuf->map);
-       return (error);
+       return (0);
 }
 
 void
@@ -1977,9 +1974,6 @@ ixgbe_setup_transmit_ring(struct tx_ring
        txr->next_avail_desc = 0;
        txr->next_to_clean = 0;
 
-       /* Set number of descriptors available */
-       txr->tx_avail = sc->num_tx_desc;
-
        bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
            0, txr->txdma.dma_map->dm_mapsize,
            BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@@ -2155,7 +2149,6 @@ int
 ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp,
     uint32_t *cmd_type_len, uint32_t *olinfo_status)
 {
-       struct ix_softc *sc = txr->sc;
        struct ixgbe_adv_tx_context_desc *TXD;
        struct ixgbe_tx_buf *tx_buffer;
 #if NVLAN > 0
@@ -2215,12 +2208,12 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, 
         * helpful for QinQ too.
         */
        if (mp->m_len < sizeof(struct ether_header))
-               return (1);
+               return (-1);
 #if NVLAN > 0
        eh = mtod(mp, struct ether_vlan_header *);
        if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
                if (mp->m_len < sizeof(struct ether_vlan_header))
-                       return (1);
+                       return (-1);
                etype = ntohs(eh->evl_proto);
                ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
        } else {
@@ -2239,7 +2232,7 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, 
        switch (etype) {
        case ETHERTYPE_IP:
                if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip))
-                       return (1);
+                       return (-1);
                m = m_getptr(mp, ehdrlen, &ipoff);
                KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip));
                ip = (struct ip *)(m->m_data + ipoff);
@@ -2250,7 +2243,7 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, 
 #ifdef notyet
        case ETHERTYPE_IPV6:
                if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip6))
-                       return (1);
+                       return (-1);
                m = m_getptr(mp, ehdrlen, &ipoff);
                KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip6));
                ip6 = (struct ip6 *)(m->m_data + ipoff);
@@ -2294,15 +2287,7 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, 
        tx_buffer->m_head = NULL;
        tx_buffer->eop_index = -1;
 
-       membar_producer();
-
-       /* We've consumed the first desc, adjust counters */
-       if (++ctxd == sc->num_tx_desc)
-               ctxd = 0;
-       txr->next_avail_desc = ctxd;
-       atomic_dec_int(&txr->tx_avail);
-
-       return (0);
+       return (1);
 }
 
 /**********************************************************************
@@ -2317,98 +2302,58 @@ ixgbe_txeof(struct tx_ring *txr)
 {
        struct ix_softc                 *sc = txr->sc;
        struct ifnet                    *ifp = &sc->arpcom.ac_if;
-       uint32_t                         first, last, done, processed;
-       uint32_t                         num_avail;
+       unsigned int                     head, tail, last;
        struct ixgbe_tx_buf             *tx_buffer;
-       struct ixgbe_legacy_tx_desc *tx_desc, *eop_desc;
+       struct ixgbe_legacy_tx_desc     *tx_desc;
 
        if (!ISSET(ifp->if_flags, IFF_RUNNING))
                return FALSE;
 
-       if (txr->tx_avail == sc->num_tx_desc) {
-               txr->queue_status = IXGBE_QUEUE_IDLE;
-               return FALSE;
-       }
+       head = txr->next_avail_desc;
+       tail = txr->next_to_clean;
 
        membar_consumer();
 
-       processed = 0;
-       first = txr->next_to_clean;
-       /* was the txt queue cleaned up in the meantime */
-       if (txr->tx_buffers == NULL)
-               return FALSE;
-       tx_buffer = &txr->tx_buffers[first];
-       /* For cleanup we just use legacy struct */
-       tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[first];
-       last = tx_buffer->eop_index;
-       if (last == -1)
-               return FALSE;
-       eop_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last];
-
-       /*
-        * Get the index of the first descriptor
-        * BEYOND the EOP and call that 'done'.
-        * I do this so the comparison in the
-        * inner while loop below can be simple
-        */
-       if (++last == sc->num_tx_desc) last = 0;
-       done = last;
+       if (head == tail)
+               return (FALSE);
 
        bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
            0, txr->txdma.dma_map->dm_mapsize,
            BUS_DMASYNC_POSTREAD);
 
-       while (eop_desc->upper.fields.status & IXGBE_TXD_STAT_DD) {
-               /* We clean the range of the packet */
-               while (first != done) {
-                       tx_desc->upper.data = 0;
-                       tx_desc->lower.data = 0;
-                       tx_desc->buffer_addr = 0;
-                       ++processed;
-
-                       if (tx_buffer->m_head) {
-                               bus_dmamap_sync(txr->txdma.dma_tag,
-                                   tx_buffer->map,
-                                   0, tx_buffer->map->dm_mapsize,
-                                   BUS_DMASYNC_POSTWRITE);
-                               bus_dmamap_unload(txr->txdma.dma_tag,
-                                   tx_buffer->map);
-                               m_freem(tx_buffer->m_head);
-                               tx_buffer->m_head = NULL;
-                       }
-                       tx_buffer->eop_index = -1;
+       for (;;) {
+               tx_buffer = &txr->tx_buffers[tail];
+               last = tx_buffer->eop_index;
+               tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last];
 
-                       if (++first == sc->num_tx_desc)
-                               first = 0;
+               if (!ISSET(tx_desc->upper.fields.status, IXGBE_TXD_STAT_DD))
+                       break;
 
-                       tx_buffer = &txr->tx_buffers[first];
-                       tx_desc = (struct ixgbe_legacy_tx_desc *)
-                           &txr->tx_base[first];
-               }
-               ++txr->packets;
-               /* See if there is more work now */
-               last = tx_buffer->eop_index;
-               if (last != -1) {
-                       eop_desc =
-                           (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last];
-                       /* Get next done point */
-                       if (++last == sc->num_tx_desc) last = 0;
-                       done = last;
-               } else
+               bus_dmamap_sync(txr->txdma.dma_tag, tx_buffer->map,
+                   0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE);
+               bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map);
+               m_freem(tx_buffer->m_head);
+
+               tx_buffer->m_head = NULL;
+               tx_buffer->eop_index = -1;
+
+               tail = last + 1;
+               if (tail == sc->num_tx_desc)
+                       tail = 0;
+               if (head == tail) {
+                       /* All clean, turn off the timer */
+                       ifp->if_timer = 0;
                        break;
+               }
        }
 
        bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
            0, txr->txdma.dma_map->dm_mapsize,
-           BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-
-       txr->next_to_clean = first;
+           BUS_DMASYNC_PREREAD);
 
-       num_avail = atomic_add_int_nv(&txr->tx_avail, processed);
+       membar_producer();
 
-       /* All clean, turn off the timer */
-       if (num_avail == sc->num_tx_desc)
-               ifp->if_timer = 0;
+       txr->next_to_clean = tail;
 
        if (ifq_is_oactive(&ifp->if_snd))
                ifq_restart(&ifp->if_snd);
Index: if_ix.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.h,v
retrieving revision 1.32
diff -u -p -r1.32 if_ix.h
--- if_ix.h     21 Nov 2016 17:21:33 -0000      1.32
+++ if_ix.h     19 Feb 2019 11:36:43 -0000
@@ -169,7 +169,6 @@ struct tx_ring {
        union ixgbe_adv_tx_desc *tx_base;
        struct ixgbe_tx_buf     *tx_buffers;
        struct ixgbe_dma_alloc  txdma;
-       volatile uint32_t       tx_avail;
        uint32_t                next_avail_desc;
        uint32_t                next_to_clean;
        enum {
