This diff makes the tx completion path run without the kernel lock held. With this change, the interrupt handler will not grab the kernel lock under normal circumstances. The diff follows the same approach as dlg@ took with vmx(4).
The diff removes the code that tries to reclaim tx descriptors in if_start() if we have only EM_TX_CLEANUP_THRESHOLD available descriptors left. Running tx_eof() in both the normal transmit path and from the interrupt handler becomes a locking nightmare. Seems to survive some serious stress-testing with tcpbench and ping -f on my x220: em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi, address f0:de:f1:70:87:8a Could use some further testing of course. Index: if_em.c =================================================================== RCS file: /home/cvs/src/sys/dev/pci/if_em.c,v retrieving revision 1.305 diff -u -p -r1.305 if_em.c --- if_em.c 19 Sep 2015 12:48:26 -0000 1.305 +++ if_em.c 28 Sep 2015 20:58:52 -0000 @@ -920,20 +920,15 @@ em_intr(void *arg) if (reg_icr & E1000_ICR_RXO) sc->rx_overruns++; - KERNEL_LOCK(); - /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { + KERNEL_LOCK(); sc->hw.get_link_status = 1; em_check_for_link(&sc->hw); em_update_link_status(sc); + KERNEL_UNLOCK(); } - if (ifp->if_flags & IFF_RUNNING && !IFQ_IS_EMPTY(&ifp->if_snd)) - em_start(ifp); - - KERNEL_UNLOCK(); - if (refill && em_rxfill(sc)) { /* Advance the Rx Queue #0 "Tail Pointer". */ E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled); @@ -1110,17 +1105,10 @@ em_encap(struct em_softc *sc, struct mbu struct em_buffer *tx_buffer, *tx_buffer_mapped; struct em_tx_desc *current_tx_desc = NULL; - /* - * Force a cleanup if number of TX descriptors - * available hits the threshold - */ - if (sc->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) { - em_txeof(sc); - /* Now do we at least have a minimal? */ - if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) { - sc->no_tx_desc_avail1++; - return (ENOBUFS); - } + /* Check that we have least the minimal number of TX descriptors. */ + if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) { + sc->no_tx_desc_avail1++; + return (ENOBUFS); } if (sc->hw.mac_type == em_82547) { @@ -1224,9 +1212,9 @@ em_encap(struct em_softc *sc, struct mbu sc->next_avail_tx_desc = i; if (sc->pcix_82544) - sc->num_tx_desc_avail -= txd_used; + atomic_sub_int(&sc->num_tx_desc_avail, txd_used); else - sc->num_tx_desc_avail -= map->dm_nsegs; + atomic_sub_int(&sc->num_tx_desc_avail, map->dm_nsegs); #if NVLAN > 0 /* Find out if we are in VLAN mode */ @@ -2393,7 +2381,7 @@ em_transmit_checksum_setup(struct em_sof if (++curr_txd == sc->num_tx_desc) curr_txd = 0; - sc->num_tx_desc_avail--; + atomic_dec_int(&sc->num_tx_desc_avail); sc->next_avail_tx_desc = curr_txd; } @@ -2407,7 +2395,7 @@ em_transmit_checksum_setup(struct em_sof void em_txeof(struct em_softc *sc) { - int first, last, done, num_avail; + int first, last, done, num_avail, free = 0; struct em_buffer *tx_buffer; struct em_tx_desc *tx_desc, *eop_desc; struct ifnet *ifp = &sc->interface_data.ac_if; @@ -2415,9 +2403,6 @@ em_txeof(struct em_softc *sc) if (sc->num_tx_desc_avail == sc->num_tx_desc) return; - KERNEL_LOCK(); - - num_avail = sc->num_tx_desc_avail; first = sc->next_tx_to_clean; tx_desc = &sc->tx_desc_base[first]; tx_buffer = &sc->tx_buffer_area[first]; @@ -2441,7 +2426,7 @@ em_txeof(struct em_softc *sc) while (first != done) { tx_desc->upper.data = 0; tx_desc->lower.data = 0; - num_avail++; + free++; if (tx_buffer->m_head != NULL) { ifp->if_opackets++; @@ -2481,25 +2466,23 @@ em_txeof(struct em_softc *sc) sc->next_tx_to_clean = first; - /* - * If we have enough room, clear IFF_OACTIVE to tell the stack - * that it is OK to send packets. - * If there are no pending descriptors, clear the timeout. Otherwise, - * if some descriptors have been freed, restart the timeout. - */ - if (num_avail > EM_TX_CLEANUP_THRESHOLD) - ifp->if_flags &= ~IFF_OACTIVE; + num_avail = atomic_add_int_nv(&sc->num_tx_desc_avail, free); /* All clean, turn off the timer */ if (num_avail == sc->num_tx_desc) ifp->if_timer = 0; - /* Some cleaned, reset the timer */ - else if (num_avail != sc->num_tx_desc_avail) - ifp->if_timer = EM_TX_TIMEOUT; - - sc->num_tx_desc_avail = num_avail; - KERNEL_UNLOCK(); + /* + * If we have enough room, clear IFF_OACTIVE to tell the stack + * that it is OK to send packets. + */ + if (ISSET(ifp->if_flags, IFF_OACTIVE) && + num_avail > EM_TX_OP_THRESHOLD) { + KERNEL_LOCK(); + CLR(ifp->if_flags, IFF_OACTIVE); + em_start(ifp); + KERNEL_UNLOCK(); + } } /********************************************************************* Index: if_em.h =================================================================== RCS file: /home/cvs/src/sys/dev/pci/if_em.h,v retrieving revision 1.57 diff -u -p -r1.57 if_em.h --- if_em.h 19 Sep 2015 12:48:26 -0000 1.57 +++ if_em.h 28 Sep 2015 21:17:45 -0000 @@ -187,10 +187,9 @@ typedef int boolean_t; #define EM_TX_TIMEOUT 5 /* set to 5 seconds */ /* - * These parameters control when the driver calls the routine to reclaim - * transmit descriptors. + * Thise parameter controls the minimum number of available transmit + * descriptors needed before we attempt transmission of a packet. */ -#define EM_TX_CLEANUP_THRESHOLD (sc->num_tx_desc / 8) #define EM_TX_OP_THRESHOLD (sc->num_tx_desc / 32) /* @@ -356,8 +355,8 @@ struct em_softc { struct em_tx_desc *tx_desc_base; u_int32_t next_avail_tx_desc; u_int32_t next_tx_to_clean; - volatile u_int16_t num_tx_desc_avail; - u_int16_t num_tx_desc; + volatile u_int32_t num_tx_desc_avail; + u_int32_t num_tx_desc; u_int32_t txd_cmd; struct em_buffer *tx_buffer_area; bus_dma_tag_t txtag; /* dma tag for tx */