On Sat, Dec 05, 2015 at 03:41:24PM +0100, Claudio Jeker wrote:
> So Mark and I spent some time to figure out what the issue was with ix(4)
> based on that info I resurected the em(4) mpsafe diff that got backed out
> and I applied the same fix. It is somewhat unclear if this fixes the
> watchdog timeouts since in theory the wdog timer should be stopped when
> hitting the race condition we hit in ix(4).
> 
> I'm currently hammering my test system with this and until now I have not
> seen a watchdog fired.

i built on this diff to try and mark em_start as mpsafe and came
up with the following.

the start/txeof interaction should be simpler with this so maybe
any watchdog issues should be resolved.

tests? ok?

Index: if_em.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.313
diff -u -p -r1.313 if_em.c
--- if_em.c     25 Nov 2015 03:09:59 -0000      1.313
+++ if_em.c     29 Dec 2015 10:46:35 -0000
@@ -230,7 +230,7 @@ void em_txeof(struct em_softc *);
 int  em_allocate_receive_structures(struct em_softc *);
 int  em_allocate_transmit_structures(struct em_softc *);
 int  em_rxfill(struct em_softc *);
-void em_rxeof(struct em_softc *);
+int  em_rxeof(struct em_softc *);
 void em_receive_checksum(struct em_softc *, struct em_rx_desc *,
                         struct mbuf *);
 void em_transmit_checksum_setup(struct em_softc *, struct mbuf *,
@@ -588,15 +588,14 @@ err_pci:
 void
 em_start(struct ifnet *ifp)
 {
-       struct mbuf    *m_head;
        struct em_softc *sc = ifp->if_softc;
-       int             post = 0;
-
-       if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(&ifp->if_snd))
-               return;
+       struct mbuf *m;
+       int post = 0;
 
-       if (!sc->link_active)
+       if (!sc->link_active) {
+               IFQ_PURGE(&ifp->if_snd);
                return;
+       }
 
        if (sc->hw.mac_type != em_82547) {
                bus_dmamap_sync(sc->txdma.dma_tag, sc->txdma.dma_map, 0,
@@ -605,22 +604,24 @@ em_start(struct ifnet *ifp)
        }
 
        for (;;) {
-               m_head = ifq_deq_begin(&ifp->if_snd);
-               if (m_head == NULL)
-                       break;
-
-               if (em_encap(sc, m_head)) {
-                       ifq_deq_rollback(&ifp->if_snd, m_head);
+               if (EM_MAX_SCATTER + 1 > sc->num_tx_desc_avail) {
                        ifq_set_oactive(&ifp->if_snd);
                        break;
                }
 
-               ifq_deq_commit(&ifp->if_snd, m_head);
+               m = ifq_dequeue(&ifp->if_snd);
+               if (m == NULL)
+                       break;
+
+               if (em_encap(sc, m) != 0) {
+                       /* ifp->if_oerrors++; */
+                       continue;
+               }
 
 #if NBPFILTER > 0
                /* Send a copy of the frame to the BPF listener */
                if (ifp->if_bpf)
-                       bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT);
+                       bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT);
 #endif
 
                /* Set timeout in case hardware has problems transmitting */
@@ -901,7 +902,6 @@ em_intr(void *arg)
        struct em_softc *sc = arg;
        struct ifnet    *ifp = &sc->interface_data.ac_if;
        u_int32_t       reg_icr, test_icr;
-       int             refill = 0;
 
        test_icr = reg_icr = E1000_READ_REG(&sc->hw, ICR);
        if (sc->hw.mac_type >= em_82571)
@@ -910,31 +910,23 @@ em_intr(void *arg)
                return (0);
 
        if (ifp->if_flags & IFF_RUNNING) {
-               em_rxeof(sc);
                em_txeof(sc);
-               refill = 1;
-       }
-
-       if (reg_icr & E1000_ICR_RXO)
-               sc->rx_overruns++;
 
-       KERNEL_LOCK();
+               if (em_rxeof(sc) || ISSET(reg_icr, E1000_ICR_RXO)) {
+                       if (em_rxfill(sc)) {
+                               E1000_WRITE_REG(&sc->hw, RDT,
+                                   sc->last_rx_desc_filled);
+                       }
+               }
+       }
 
        /* Link status change */
        if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
+               KERNEL_LOCK();
                sc->hw.get_link_status = 1;
                em_check_for_link(&sc->hw);
                em_update_link_status(sc);
-       }
-
-       if (ifp->if_flags & IFF_RUNNING && !IFQ_IS_EMPTY(&ifp->if_snd))
-               em_start(ifp);
-
-       KERNEL_UNLOCK();
-
-       if (refill && em_rxfill(sc)) {
-               /* Advance the Rx Queue #0 "Tail Pointer". */
-               E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled);
+               KERNEL_UNLOCK();
        }
 
        return (1);
@@ -1096,7 +1088,7 @@ int
 em_encap(struct em_softc *sc, struct mbuf *m_head)
 {
        u_int32_t       txd_upper;
-       u_int32_t       txd_lower, txd_used = 0, txd_saved = 0;
+       u_int32_t       txd_lower, txd_used = 0;
        int             i, j, first, error = 0, last = 0;
        bus_dmamap_t    map;
 
@@ -1109,25 +1101,6 @@ em_encap(struct em_softc *sc, struct mbu
        struct em_tx_desc *current_tx_desc = NULL;
 
        /*
-        * Force a cleanup if number of TX descriptors
-        * available hits the threshold
-        */
-       if (sc->num_tx_desc_avail <= EM_TX_CLEANUP_THRESHOLD) {
-               em_txeof(sc);
-               /* Now do we at least have a minimal? */
-               if (sc->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) {
-                       sc->no_tx_desc_avail1++;
-                       return (ENOBUFS);
-               }
-       }
-
-       if (sc->hw.mac_type == em_82547) {
-               bus_dmamap_sync(sc->txdma.dma_tag, sc->txdma.dma_map, 0,
-                   sc->txdma.dma_map->dm_mapsize,
-                   BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
-       }
-
-       /*
         * Map the packet for DMA.
         *
         * Capture the first descriptor index,
@@ -1153,13 +1126,14 @@ em_encap(struct em_softc *sc, struct mbu
                /* FALLTHROUGH */
        default:
                sc->no_tx_dma_setup++;
-               goto loaderr;
+               return (error);
        }
 
-       EM_KASSERT(map->dm_nsegs!= 0, ("em_encap: empty packet"));
-
-       if (map->dm_nsegs > sc->num_tx_desc_avail - 2)
-               goto fail;
+       if (sc->hw.mac_type == em_82547) {
+               bus_dmamap_sync(sc->txdma.dma_tag, sc->txdma.dma_map, 0,
+                   sc->txdma.dma_map->dm_mapsize,
+                   BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+       }
 
        if (sc->hw.mac_type >= em_82543 && sc->hw.mac_type != em_82575 &&
            sc->hw.mac_type != em_82580 && sc->hw.mac_type != em_i210 &&
@@ -1169,8 +1143,6 @@ em_encap(struct em_softc *sc, struct mbu
                txd_upper = txd_lower = 0;
 
        i = sc->next_avail_tx_desc;
-       if (sc->pcix_82544)
-               txd_saved = i;
 
        for (j = 0; j < map->dm_nsegs; j++) {
                /* If sc is 82544 and on PCI-X bus */
@@ -1179,14 +1151,10 @@ em_encap(struct em_softc *sc, struct mbu
                         * Check the Address and Length combination and
                         * split the data accordingly
                         */
-                       array_elements = 
em_fill_descriptors(map->dm_segs[j].ds_addr,
-                                                            
map->dm_segs[j].ds_len,
-                                                            &desc_array);
+                       array_elements = em_fill_descriptors(
+                           map->dm_segs[j].ds_addr, map->dm_segs[j].ds_len,
+                           &desc_array);
                        for (counter = 0; counter < array_elements; counter++) {
-                               if (txd_used == sc->num_tx_desc_avail) {
-                                       sc->next_avail_tx_desc = txd_saved;
-                                       goto fail;
-                               }
                                tx_buffer = &sc->tx_buffer_area[i];
                                current_tx_desc = &sc->tx_desc_base[i];
                                current_tx_desc->buffer_addr = htole64(
@@ -1220,12 +1188,6 @@ em_encap(struct em_softc *sc, struct mbu
                }
        }
 
-       sc->next_avail_tx_desc = i;
-       if (sc->pcix_82544)
-               sc->num_tx_desc_avail -= txd_used;
-       else
-               sc->num_tx_desc_avail -= map->dm_nsegs;
-
 #if NVLAN > 0
        /* Find out if we are in VLAN mode */
        if (m_head->m_flags & M_VLANTAG) {
@@ -1259,6 +1221,12 @@ em_encap(struct em_softc *sc, struct mbu
        tx_buffer = &sc->tx_buffer_area[first];
        tx_buffer->next_eop = last;
 
+       sc->next_avail_tx_desc = i;
+       if (sc->pcix_82544)
+               atomic_sub_int(&sc->num_tx_desc_avail, txd_used);
+       else
+               atomic_sub_int(&sc->num_tx_desc_avail, map->dm_nsegs);
+
        /* 
         * Advance the Transmit Descriptor Tail (Tdt),
         * this tells the E1000 that this frame is
@@ -1277,18 +1245,6 @@ em_encap(struct em_softc *sc, struct mbu
        }
 
        return (0);
-
-fail:
-       sc->no_tx_desc_avail2++;
-       bus_dmamap_unload(sc->txtag, map);
-       error = ENOBUFS;
-loaderr:
-       if (sc->hw.mac_type == em_82547) {
-               bus_dmamap_sync(sc->txdma.dma_tag, sc->txdma.dma_map, 0,
-                   sc->txdma.dma_map->dm_mapsize,
-                   BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-       }
-       return (error);
 }
 
 /*********************************************************************
@@ -1558,8 +1514,6 @@ em_stop(void *arg, int softonly)
 
        /* Tell the stack that the interface is no longer active */
        ifp->if_flags &= ~IFF_RUNNING;
-       ifq_clr_oactive(&ifp->if_snd);
-       ifp->if_timer = 0;
 
        INIT_DEBUGOUT("em_stop: begin");
 
@@ -1572,9 +1526,13 @@ em_stop(void *arg, int softonly)
        }
 
        intr_barrier(sc->sc_intrhand);
+       ifq_barrier(&ifp->if_snd);
 
        KASSERT((ifp->if_flags & IFF_RUNNING) == 0);
 
+       ifq_clr_oactive(&ifp->if_snd);
+       ifp->if_timer = 0;
+
        em_free_transmit_structures(sc);
        em_free_receive_structures(sc);
 }
@@ -1877,6 +1835,7 @@ em_setup_interface(struct em_softc *sc)
        strlcpy(ifp->if_xname, sc->sc_dv.dv_xname, IFNAMSIZ);
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+       ifp->if_xflags = IFXF_MPSAFE;
        ifp->if_ioctl = em_ioctl;
        ifp->if_start = em_start;
        ifp->if_watchdog = em_watchdog;
@@ -2392,7 +2351,7 @@ em_transmit_checksum_setup(struct em_sof
        if (++curr_txd == sc->num_tx_desc)
                curr_txd = 0;
 
-       sc->num_tx_desc_avail--;
+       atomic_dec_int(&sc->num_tx_desc_avail);
        sc->next_avail_tx_desc = curr_txd;
 }
 
@@ -2406,7 +2365,7 @@ em_transmit_checksum_setup(struct em_sof
 void
 em_txeof(struct em_softc *sc)
 {
-       int first, last, done, num_avail;
+       int first, last, done, num_avail, free = 0;
        struct em_buffer *tx_buffer;
        struct em_tx_desc   *tx_desc, *eop_desc;
        struct ifnet   *ifp = &sc->interface_data.ac_if;
@@ -2414,9 +2373,6 @@ em_txeof(struct em_softc *sc)
        if (sc->num_tx_desc_avail == sc->num_tx_desc)
                return;
 
-       KERNEL_LOCK();
-
-       num_avail = sc->num_tx_desc_avail;
        first = sc->next_tx_to_clean;
        tx_desc = &sc->tx_desc_base[first];
        tx_buffer = &sc->tx_buffer_area[first];
@@ -2440,7 +2396,7 @@ em_txeof(struct em_softc *sc)
                while (first != done) {
                        tx_desc->upper.data = 0;
                        tx_desc->lower.data = 0;
-                       num_avail++;
+                       free++;
 
                        if (tx_buffer->m_head != NULL) {
                                ifp->if_opackets++;
@@ -2480,25 +2436,12 @@ em_txeof(struct em_softc *sc)
 
        sc->next_tx_to_clean = first;
 
-       /*
-        * If we have enough room, clear IFF_OACTIVE to tell the stack
-        * that it is OK to send packets.
-        * If there are no pending descriptors, clear the timeout. Otherwise,
-        * if some descriptors have been freed, restart the timeout.
-        */
-       if (num_avail > EM_TX_CLEANUP_THRESHOLD)
-               ifq_clr_oactive(&ifp->if_snd);
+       num_avail = atomic_add_int_nv(&sc->num_tx_desc_avail, free);
 
-       /* All clean, turn off the timer */
-       if (num_avail == sc->num_tx_desc)
+       if (ifq_is_oactive(&ifp->if_snd))
+               ifq_restart(&ifp->if_snd);
+       else if (num_avail == sc->num_tx_desc)
                ifp->if_timer = 0;
-       /* Some cleaned, reset the timer */
-       else if (num_avail != sc->num_tx_desc_avail)
-               ifp->if_timer = EM_TX_TIMEOUT;
-
-       sc->num_tx_desc_avail = num_avail;
-
-       KERNEL_UNLOCK();
 }
 
 /*********************************************************************
@@ -2813,7 +2756,7 @@ em_rxfill(struct em_softc *sc)
  *  dma'ed into host memory to upper layer.
  *
  *********************************************************************/
-void
+int
 em_rxeof(struct em_softc *sc)
 {
        struct ifnet        *ifp = &sc->interface_data.ac_if;
@@ -2822,7 +2765,7 @@ em_rxeof(struct em_softc *sc)
        u_int8_t            accept_frame = 0;
        u_int8_t            eop = 0;
        u_int16_t           len, desc_len, prev_len_adj;
-       int                 i;
+       int                 i, rv = 0;
 
        /* Pointer to the receive descriptor being examined. */
        struct em_rx_desc   *desc;
@@ -2830,7 +2773,7 @@ em_rxeof(struct em_softc *sc)
        u_int8_t            status;
 
        if (if_rxr_inuse(&sc->rx_ring) == 0)
-               return;
+               return (0);
 
        i = sc->next_rx_desc_to_check;
 
@@ -2863,6 +2806,7 @@ em_rxeof(struct em_softc *sc)
                }
 
                if_rxr_put(&sc->rx_ring, 1);
+               rv = 1;
 
                accept_frame = 1;
                prev_len_adj = 0;
@@ -2968,6 +2912,8 @@ em_rxeof(struct em_softc *sc)
        sc->next_rx_desc_to_check = i;
 
        if_input(ifp, &ml);
+
+       return (rv);
 }
 
 /*********************************************************************
Index: if_em.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.h,v
retrieving revision 1.61
diff -u -p -r1.61 if_em.h
--- if_em.h     24 Nov 2015 17:11:39 -0000      1.61
+++ if_em.h     29 Dec 2015 10:46:35 -0000
@@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <sys/device.h>
 #include <sys/socket.h>
 #include <sys/timeout.h>
+#include <sys/atomic.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
@@ -181,10 +182,9 @@ typedef int        boolean_t;
 #define EM_TX_TIMEOUT                  5       /* set to 5 seconds */
 
 /*
- * These parameters control when the driver calls the routine to reclaim
- * transmit descriptors.
+ * Thise parameter controls the minimum number of available transmit
+ * descriptors needed before we attempt transmission of a packet.
  */
-#define EM_TX_CLEANUP_THRESHOLD                (sc->num_tx_desc / 8)
 #define EM_TX_OP_THRESHOLD             (sc->num_tx_desc / 32)
 
 /*
@@ -350,8 +350,8 @@ struct em_softc {
        struct em_tx_desc       *tx_desc_base;
        u_int32_t               next_avail_tx_desc;
        u_int32_t               next_tx_to_clean;
-       volatile u_int16_t      num_tx_desc_avail;
-       u_int16_t               num_tx_desc;
+       volatile u_int32_t      num_tx_desc_avail;
+       u_int32_t               num_tx_desc;
        u_int32_t               txd_cmd;
        struct em_buffer        *tx_buffer_area;
        bus_dma_tag_t           txtag;          /* dma tag for tx */

Reply via email to