iwx(4) has an issue which occurs very occasionally for me (every couple
of days or sometimes even weeks) where ssh(1) fails to connect until the
interface is reset with ifconfig down/up.

The initial protocol exchange with the SSH server succeeds, but as soon
as the client enters interactive state and sets TCP_NODELAY the connection
will hang.
While the interface is in this broken state other traffic such as ping
still makes it through, but any new ssh connection attempts show the
same symptom. It is just the interactive SSH packets which are stuck.

TCP_NODELAY packets are mapped to a special QoS TID by net80211, and each
such TID gets mapped to a dedicated Tx aggregation queue by the driver.
Combined with the observation that the Linux driver carries a workaround
for "stuck" individual Tx queues, we are likely facing a situation where
only one Tx queue in the device has stopped working.

Our interface watchdog cannot detect single Tx queue failures at present.
It is satisfied as long as packets make it out on any queue.
With the patch below we use a Tx timer per queue which will hopefully result
in a device timeout when the problem occurs. A manual reset will no longer be
required to unwedge things since the watchdog should now trigger a reset.

It is possible that other hangs which have been reported are related to
this but show different symptoms depending on which Tx queue gets stuck.
If the default Tx agg queue gets stuck it will become impossible to send
data packets that are not marked as TCP_NODELAY. If you are connected over
ssh while this happens the open ssh session might prevent the watchdog from
triggering and no new connections can be made.

I wish we had a fix for Tx queues getting stuck in the first place but this
is the best I can do.

ok?


diff 02c3ac519701a4fe198f8ee3de592b34a39ee6f7 
9b169d95d510ed2f3749ba598aa1cdb81cbff623
blob - 5cac83fd48ebe997a4cd7842730b0cb05f7c28ee
blob + c7dcd675c46e07ae4150a6dfdb90a189187dd5b2
--- sys/dev/pci/if_iwx.c
+++ sys/dev/pci/if_iwx.c
@@ -4552,8 +4552,6 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_pack
        bus_dmamap_sync(sc->sc_dmat, data->map, 0, IWX_RBUF_SIZE,
            BUS_DMASYNC_POSTREAD);
 
-       sc->sc_tx_timer = 0;
-
        /* Sanity checks. */
        if (sizeof(*tx_resp) > len)
                return;
@@ -4563,6 +4561,8 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_pack
            tx_resp->frame_count * sizeof(tx_resp->status) > len)
                return;
 
+       sc->sc_tx_timer[qid] = 0;
+
        if (tx_resp->frame_count > 1) /* A-MPDU */
                return;
 
@@ -4658,7 +4658,7 @@ iwx_rx_compressed_ba(struct iwx_softc *sc, struct iwx_
                idx = le16toh(ba_tfd->tfd_index);
                if (idx >= IWX_TX_RING_COUNT)
                        continue;
-               sc->sc_tx_timer = 0;
+               sc->sc_tx_timer[qid] = 0;
                iwx_txq_advance(sc, ring, idx);
                iwx_clear_oactive(sc, ring);
        }
@@ -5433,6 +5433,9 @@ iwx_tx(struct iwx_softc *sc, struct mbuf *m, struct ie
                sc->qfullmsk |= 1 << ring->qid;
        }
 
+       if (ic->ic_if.if_flags & IFF_UP)
+               sc->sc_tx_timer[ring->qid] = 15;
+
        return 0;
 }
 
@@ -7973,10 +7976,8 @@ iwx_start(struct ifnet *ifp)
                        continue;
                }
 
-               if (ifp->if_flags & IFF_UP) {
-                       sc->sc_tx_timer = 15;
+               if (ifp->if_flags & IFF_UP)
                        ifp->if_timer = 1;
-               }
        }
 
        return;
@@ -8045,7 +8046,8 @@ iwx_stop(struct ifnet *ifp)
                struct iwx_rxba_data *rxba = &sc->sc_rxba_data[i];
                iwx_clear_reorder_buffer(sc, rxba);
        }
-       ifp->if_timer = sc->sc_tx_timer = 0;
+       memset(sc->sc_tx_timer, 0, sizeof(sc->sc_tx_timer));
+       ifp->if_timer = 0;
 
        splx(s);
 }
@@ -8054,21 +8056,30 @@ void
 iwx_watchdog(struct ifnet *ifp)
 {
        struct iwx_softc *sc = ifp->if_softc;
+       int i;
 
        ifp->if_timer = 0;
-       if (sc->sc_tx_timer > 0) {
-               if (--sc->sc_tx_timer == 0) {
-                       printf("%s: device timeout\n", DEVNAME(sc));
-                       if (ifp->if_flags & IFF_DEBUG) {
-                               iwx_nic_error(sc);
-                               iwx_dump_driver_status(sc);
+
+       /*
+        * We maintain a separate timer for each Tx queue because
+        * Tx aggregation queues can get "stuck" while other queues
+        * keep working. The Linux driver uses a similar workaround.
+        */
+       for (i = 0; i < nitems(sc->sc_tx_timer); i++) {
+               if (sc->sc_tx_timer[i] > 0) {
+                       if (--sc->sc_tx_timer[i] == 0) {
+                               printf("%s: device timeout\n", DEVNAME(sc));
+                               if (ifp->if_flags & IFF_DEBUG) {
+                                       iwx_nic_error(sc);
+                                       iwx_dump_driver_status(sc);
+                               }
+                               if ((sc->sc_flags & IWX_FLAG_SHUTDOWN) == 0)
+                                       task_add(systq, &sc->init_task);
+                               ifp->if_oerrors++;
+                               return;
                        }
-                       if ((sc->sc_flags & IWX_FLAG_SHUTDOWN) == 0)
-                               task_add(systq, &sc->init_task);
-                       ifp->if_oerrors++;
-                       return;
+                       ifp->if_timer = 1;
                }
-               ifp->if_timer = 1;
        }
 
        ieee80211_watchdog(ifp);
blob - 009848fc25471ad90717094d1f67d2f4d22a3d6a
blob + bcedf33e0861fb1fcecb4c8af3e209837e03e535
--- sys/dev/pci/if_iwxvar.h
+++ sys/dev/pci/if_iwxvar.h
@@ -563,7 +563,7 @@ struct iwx_softc {
        struct iwx_nvm_data sc_nvm;
        struct iwx_bf_data sc_bf;
 
-       int sc_tx_timer;
+       int sc_tx_timer[IWX_LAST_AGG_TX_QUEUE];
        int sc_rx_ba_sessions;
 
        int sc_scan_last_antenna;

Reply via email to