This diff makes the tx completion path run without the kernel lock
held.  With this change, the interrupt handler will not grab the
kernel lock under normal circumstances.  The diff follows the same
approach as dlg@ took with vmx(4).

The diff removes the code that tries to reclaim tx descriptors in
if_start() if we have only EM_TX_CLEANUP_THRESHOLD available
descriptors left.  Running tx_eof() in both the normal transmit path
and from the interrupt handler becomes a locking nightmare.

Seems to survive some serious stress-testing with tcpbench and ping -f
on my x220:

em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi, address 
f0:de:f1:70:87:8a

Could use some further testing of course.


Index: if_em.c
===================================================================
RCS file: /home/cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.305
diff -u -p -r1.305 if_em.c
--- if_em.c     19 Sep 2015 12:48:26 -0000      1.305
+++ if_em.c     28 Sep 2015 20:58:52 -0000
@@ -920,20 +920,15 @@ em_intr(void *arg)
        if (reg_icr & E1000_ICR_RXO)
                sc->rx_overruns++;
 
-       KERNEL_LOCK();
-
        /* Link status change */
        if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
+               KERNEL_LOCK();
                sc->hw.get_link_status = 1;
                em_check_for_link(&sc->hw);
                em_update_link_status(sc);
+               KERNEL_UNLOCK();
        }
 
-       if (ifp->if_flags & IFF_RUNNING && !IFQ_IS_EMPTY(&ifp->if_snd))
-               em_start(ifp);
-
-       KERNEL_UNLOCK();
-
        if (refill && em_rxfill(sc)) {
                /* Advance the Rx Queue #0 "Tail Pointer". */
                E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled);
@@ -1110,17 +1105,10 @@ em_encap(struct em_softc *sc, struct mbu
        struct em_buffer   *tx_buffer, *tx_buffer_mapped;
        struct em_tx_desc *current_tx_desc = NULL;
 
-       /*
-        * Force a cleanup if number of TX descriptors
-        * available hits the threshold
-        */
-       if (sc->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) {
-               em_txeof(sc);
-               /* Now do we at least have a minimal? */
-               if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
-                       sc->no_tx_desc_avail1++;
-                       return (ENOBUFS);
-               }
+       /* Check that we have least the minimal number of TX descriptors. */
+       if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
+               sc->no_tx_desc_avail1++;
+               return (ENOBUFS);
        }
 
        if (sc->hw.mac_type == em_82547) {
@@ -1224,9 +1212,9 @@ em_encap(struct em_softc *sc, struct mbu
 
        sc->next_avail_tx_desc = i;
        if (sc->pcix_82544)
-               sc->num_tx_desc_avail -= txd_used;
+               atomic_sub_int(&sc->num_tx_desc_avail, txd_used);
        else
-               sc->num_tx_desc_avail -= map->dm_nsegs;
+               atomic_sub_int(&sc->num_tx_desc_avail, map->dm_nsegs);
 
 #if NVLAN > 0
        /* Find out if we are in VLAN mode */
@@ -2393,7 +2381,7 @@ em_transmit_checksum_setup(struct em_sof
        if (++curr_txd == sc->num_tx_desc)
                curr_txd = 0;
 
-       sc->num_tx_desc_avail--;
+       atomic_dec_int(&sc->num_tx_desc_avail);
        sc->next_avail_tx_desc = curr_txd;
 }
 
@@ -2407,7 +2395,7 @@ em_transmit_checksum_setup(struct em_sof
 void
 em_txeof(struct em_softc *sc)
 {
-       int first, last, done, num_avail;
+       int first, last, done, num_avail, free = 0;
        struct em_buffer *tx_buffer;
        struct em_tx_desc   *tx_desc, *eop_desc;
        struct ifnet   *ifp = &sc->interface_data.ac_if;
@@ -2415,9 +2403,6 @@ em_txeof(struct em_softc *sc)
        if (sc->num_tx_desc_avail == sc->num_tx_desc)
                return;
 
-       KERNEL_LOCK();
-
-       num_avail = sc->num_tx_desc_avail;
        first = sc->next_tx_to_clean;
        tx_desc = &sc->tx_desc_base[first];
        tx_buffer = &sc->tx_buffer_area[first];
@@ -2441,7 +2426,7 @@ em_txeof(struct em_softc *sc)
                while (first != done) {
                        tx_desc->upper.data = 0;
                        tx_desc->lower.data = 0;
-                       num_avail++;
+                       free++;
 
                        if (tx_buffer->m_head != NULL) {
                                ifp->if_opackets++;
@@ -2481,25 +2466,23 @@ em_txeof(struct em_softc *sc)
 
        sc->next_tx_to_clean = first;
 
-       /*
-        * If we have enough room, clear IFF_OACTIVE to tell the stack
-        * that it is OK to send packets.
-        * If there are no pending descriptors, clear the timeout. Otherwise,
-        * if some descriptors have been freed, restart the timeout.
-        */
-       if (num_avail > EM_TX_CLEANUP_THRESHOLD)
-               ifp->if_flags &= ~IFF_OACTIVE;
+       num_avail = atomic_add_int_nv(&sc->num_tx_desc_avail, free);
 
        /* All clean, turn off the timer */
        if (num_avail == sc->num_tx_desc)
                ifp->if_timer = 0;
-       /* Some cleaned, reset the timer */
-       else if (num_avail != sc->num_tx_desc_avail)
-               ifp->if_timer = EM_TX_TIMEOUT;
-
-       sc->num_tx_desc_avail = num_avail;
 
-       KERNEL_UNLOCK();
+       /*
+        * If we have enough room, clear IFF_OACTIVE to tell the stack
+        * that it is OK to send packets.
+        */
+       if (ISSET(ifp->if_flags, IFF_OACTIVE) &&
+           num_avail > EM_TX_OP_THRESHOLD) {
+               KERNEL_LOCK();
+               CLR(ifp->if_flags, IFF_OACTIVE);
+               em_start(ifp);
+               KERNEL_UNLOCK();
+       }
 }
 
 /*********************************************************************
Index: if_em.h
===================================================================
RCS file: /home/cvs/src/sys/dev/pci/if_em.h,v
retrieving revision 1.57
diff -u -p -r1.57 if_em.h
--- if_em.h     19 Sep 2015 12:48:26 -0000      1.57
+++ if_em.h     28 Sep 2015 21:17:45 -0000
@@ -187,10 +187,9 @@ typedef int        boolean_t;
 #define EM_TX_TIMEOUT                  5       /* set to 5 seconds */
 
 /*
- * These parameters control when the driver calls the routine to reclaim
- * transmit descriptors.
+ * Thise parameter controls the minimum number of available transmit
+ * descriptors needed before we attempt transmission of a packet.
  */
-#define EM_TX_CLEANUP_THRESHOLD                (sc->num_tx_desc / 8)
 #define EM_TX_OP_THRESHOLD             (sc->num_tx_desc / 32)
 
 /*
@@ -356,8 +355,8 @@ struct em_softc {
        struct em_tx_desc       *tx_desc_base;
        u_int32_t               next_avail_tx_desc;
        u_int32_t               next_tx_to_clean;
-       volatile u_int16_t      num_tx_desc_avail;
-       u_int16_t               num_tx_desc;
+       volatile u_int32_t      num_tx_desc_avail;
+       u_int32_t               num_tx_desc;
        u_int32_t               txd_cmd;
        struct em_buffer        *tx_buffer_area;
        bus_dma_tag_t           txtag;          /* dma tag for tx */

Reply via email to