So Mark and I spent some time figuring out what the issue was with ix(4). Based on that info, I resurrected the em(4) mpsafe diff that got backed out and applied the same fix. It is somewhat unclear if this fixes the watchdog timeouts since, in theory, the wdog timer should be stopped when hitting the race condition we hit in ix(4).
I'm currently hammering my test system with this and until now I have not seen a watchdog fired. -- :wq Claudio Index: pci/if_em.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.c,v retrieving revision 1.313 diff -u -p -r1.313 if_em.c --- pci/if_em.c 25 Nov 2015 03:09:59 -0000 1.313 +++ pci/if_em.c 3 Dec 2015 22:08:54 -0000 @@ -612,6 +612,16 @@ em_start(struct ifnet *ifp) if (em_encap(sc, m_head)) { ifq_deq_rollback(&ifp->if_snd, m_head); ifq_set_oactive(&ifp->if_snd); + /* + * Make sure there are still packets on the + * ring. The interrupt handler may have + * cleaned up the ring before we were able to + * set the IF_OACTIVE flag. + */ + if (sc->num_tx_desc_avail == sc->num_tx_desc) { + ifq_clr_oactive(&ifp->if_snd); + continue; + } break; } @@ -918,20 +928,17 @@ em_intr(void *arg) if (reg_icr & E1000_ICR_RXO) sc->rx_overruns++; - KERNEL_LOCK(); - /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { + KERNEL_LOCK(); sc->hw.get_link_status = 1; em_check_for_link(&sc->hw); em_update_link_status(sc); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) + em_start(ifp); + KERNEL_UNLOCK(); } - if (ifp->if_flags & IFF_RUNNING && !IFQ_IS_EMPTY(&ifp->if_snd)) - em_start(ifp); - - KERNEL_UNLOCK(); - if (refill && em_rxfill(sc)) { /* Advance the Rx Queue #0 "Tail Pointer". */ E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled); @@ -1108,17 +1115,10 @@ em_encap(struct em_softc *sc, struct mbu struct em_buffer *tx_buffer, *tx_buffer_mapped; struct em_tx_desc *current_tx_desc = NULL; - /* - * Force a cleanup if number of TX descriptors - * available hits the threshold - */ - if (sc->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) { - em_txeof(sc); - /* Now do we at least have a minimal? */ - if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) { - sc->no_tx_desc_avail1++; - return (ENOBUFS); - } + /* Check that we have least the minimal number of TX descriptors. 
*/ + if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) { + sc->no_tx_desc_avail1++; + return (ENOBUFS); } if (sc->hw.mac_type == em_82547) { @@ -1220,12 +1220,6 @@ em_encap(struct em_softc *sc, struct mbu } } - sc->next_avail_tx_desc = i; - if (sc->pcix_82544) - sc->num_tx_desc_avail -= txd_used; - else - sc->num_tx_desc_avail -= map->dm_nsegs; - #if NVLAN > 0 /* Find out if we are in VLAN mode */ if (m_head->m_flags & M_VLANTAG) { @@ -1259,6 +1253,14 @@ em_encap(struct em_softc *sc, struct mbu tx_buffer = &sc->tx_buffer_area[first]; tx_buffer->next_eop = last; + membar_producer(); + + sc->next_avail_tx_desc = i; + if (sc->pcix_82544) + atomic_sub_int(&sc->num_tx_desc_avail, txd_used); + else + atomic_sub_int(&sc->num_tx_desc_avail, map->dm_nsegs); + /* * Advance the Transmit Descriptor Tail (Tdt), * this tells the E1000 that this frame is @@ -2389,10 +2391,12 @@ em_transmit_checksum_setup(struct em_sof tx_buffer->m_head = NULL; tx_buffer->next_eop = -1; + membar_producer(); + if (++curr_txd == sc->num_tx_desc) curr_txd = 0; - sc->num_tx_desc_avail--; + atomic_dec_int(&sc->num_tx_desc_avail); sc->next_avail_tx_desc = curr_txd; } @@ -2406,7 +2410,7 @@ em_transmit_checksum_setup(struct em_sof void em_txeof(struct em_softc *sc) { - int first, last, done, num_avail; + int first, last, done, num_avail, free = 0; struct em_buffer *tx_buffer; struct em_tx_desc *tx_desc, *eop_desc; struct ifnet *ifp = &sc->interface_data.ac_if; @@ -2414,9 +2418,8 @@ em_txeof(struct em_softc *sc) if (sc->num_tx_desc_avail == sc->num_tx_desc) return; - KERNEL_LOCK(); + membar_consumer(); - num_avail = sc->num_tx_desc_avail; first = sc->next_tx_to_clean; tx_desc = &sc->tx_desc_base[first]; tx_buffer = &sc->tx_buffer_area[first]; @@ -2440,7 +2443,7 @@ em_txeof(struct em_softc *sc) while (first != done) { tx_desc->upper.data = 0; tx_desc->lower.data = 0; - num_avail++; + free++; if (tx_buffer->m_head != NULL) { ifp->if_opackets++; @@ -2480,25 +2483,22 @@ em_txeof(struct em_softc *sc) 
sc->next_tx_to_clean = first; - /* - * If we have enough room, clear IFF_OACTIVE to tell the stack - * that it is OK to send packets. - * If there are no pending descriptors, clear the timeout. Otherwise, - * if some descriptors have been freed, restart the timeout. - */ - if (num_avail > EM_TX_CLEANUP_THRESHOLD) - ifq_clr_oactive(&ifp->if_snd); + num_avail = atomic_add_int_nv(&sc->num_tx_desc_avail, free); /* All clean, turn off the timer */ if (num_avail == sc->num_tx_desc) ifp->if_timer = 0; - /* Some cleaned, reset the timer */ - else if (num_avail != sc->num_tx_desc_avail) - ifp->if_timer = EM_TX_TIMEOUT; - sc->num_tx_desc_avail = num_avail; - - KERNEL_UNLOCK(); + /* + * If we have enough room, clear IFF_OACTIVE to tell the stack + * that it is OK to send packets. + */ + if (ifq_is_oactive(&ifp->if_snd) && num_avail > EM_TX_OP_THRESHOLD) { + ifq_clr_oactive(&ifp->if_snd); + KERNEL_LOCK(); + em_start(ifp); + KERNEL_UNLOCK(); + } } /********************************************************************* Index: pci/if_em.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.h,v retrieving revision 1.61 diff -u -p -r1.61 if_em.h --- pci/if_em.h 24 Nov 2015 17:11:39 -0000 1.61 +++ pci/if_em.h 3 Dec 2015 22:14:43 -0000 @@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE. #include <sys/device.h> #include <sys/socket.h> #include <sys/timeout.h> +#include <sys/atomic.h> #include <net/if.h> #include <net/if_media.h> @@ -181,10 +182,9 @@ typedef int boolean_t; #define EM_TX_TIMEOUT 5 /* set to 5 seconds */ /* - * These parameters control when the driver calls the routine to reclaim - * transmit descriptors. + * This parameter controls the minimum number of available transmit + * descriptors needed before we attempt transmission of a packet. 
*/ -#define EM_TX_CLEANUP_THRESHOLD (sc->num_tx_desc / 8) #define EM_TX_OP_THRESHOLD (sc->num_tx_desc / 32) /* @@ -350,8 +350,8 @@ struct em_softc { struct em_tx_desc *tx_desc_base; u_int32_t next_avail_tx_desc; u_int32_t next_tx_to_clean; - volatile u_int16_t num_tx_desc_avail; - u_int16_t num_tx_desc; + volatile u_int32_t num_tx_desc_avail; + u_int32_t num_tx_desc; u_int32_t txd_cmd; struct em_buffer *tx_buffer_area; bus_dma_tag_t txtag; /* dma tag for tx */