iwx(4) has an issue which occurs very occasionally for me (every couple of days or sometimes even weeks) where ssh(1) fails to connect until the interface is reset with ifconfig down/up.
The initial protocol exchange with the SSH server succeeds, but as soon as the client enters interactive state and sets TCP_NODELAY the connection will hang. While the interface is in this broken state other traffic such as ping still makes it through, but any new ssh connection attempts show the same symptom. It is just the interactive SSH packets which are stuck. TCP_NODELAY packets are mapped to a special QoS TID by net80211, and each such TID gets mapped to a dedicated Tx aggregation queue by the driver. Combined with the observation that the Linux driver carries a workaround for "stuck" individual Tx queues, we are likely facing a situation where only one Tx queue in the device has stopped working. Our interface watchdog cannot detect single Tx queue failures at present. It is satisfied as long as packets make it out on any queue. With the patch below we use a Tx timer per queue which will hopefully result in a device timeout when the problem occurs. A manual reset will no longer be required to unwedge things since the watchdog should now trigger a reset. It is possible that other hangs which have been reported are related to this but show different symptoms depending on which Tx queue gets stuck. If the default Tx agg queue gets stuck it will become impossible to send data packets that are not marked as TCP_NODELAY. If you are connected over ssh while this happens the open ssh session might prevent the watchdog from triggering and no new connections can be made. I wish we had a fix for Tx queues getting stuck in the first place but this is the best I can do. ok? diff 02c3ac519701a4fe198f8ee3de592b34a39ee6f7 9b169d95d510ed2f3749ba598aa1cdb81cbff623 blob - 5cac83fd48ebe997a4cd7842730b0cb05f7c28ee blob + c7dcd675c46e07ae4150a6dfdb90a189187dd5b2 --- sys/dev/pci/if_iwx.c +++ sys/dev/pci/if_iwx.c @@ -4552,8 +4552,6 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_pack bus_dmamap_sync(sc->sc_dmat, data->map, 0, IWX_RBUF_SIZE, BUS_DMASYNC_POSTREAD); - sc->sc_tx_timer = 0; - /* Sanity checks. */ if (sizeof(*tx_resp) > len) return; @@ -4563,6 +4561,8 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_pack tx_resp->frame_count * sizeof(tx_resp->status) > len) return; + sc->sc_tx_timer[qid] = 0; + if (tx_resp->frame_count > 1) /* A-MPDU */ return; @@ -4658,7 +4658,7 @@ iwx_rx_compressed_ba(struct iwx_softc *sc, struct iwx_ idx = le16toh(ba_tfd->tfd_index); if (idx >= IWX_TX_RING_COUNT) continue; - sc->sc_tx_timer = 0; + sc->sc_tx_timer[qid] = 0; iwx_txq_advance(sc, ring, idx); iwx_clear_oactive(sc, ring); } @@ -5433,6 +5433,9 @@ iwx_tx(struct iwx_softc *sc, struct mbuf *m, struct ie sc->qfullmsk |= 1 << ring->qid; } + if (ic->ic_if.if_flags & IFF_UP) + sc->sc_tx_timer[ring->qid] = 15; + return 0; } @@ -7973,10 +7976,8 @@ iwx_start(struct ifnet *ifp) continue; } - if (ifp->if_flags & IFF_UP) { - sc->sc_tx_timer = 15; + if (ifp->if_flags & IFF_UP) ifp->if_timer = 1; - } } return; @@ -8045,7 +8046,8 @@ iwx_stop(struct ifnet *ifp) struct iwx_rxba_data *rxba = &sc->sc_rxba_data[i]; iwx_clear_reorder_buffer(sc, rxba); } - ifp->if_timer = sc->sc_tx_timer = 0; + memset(sc->sc_tx_timer, 0, sizeof(sc->sc_tx_timer)); + ifp->if_timer = 0; splx(s); } @@ -8054,21 +8056,30 @@ void iwx_watchdog(struct ifnet *ifp) { struct iwx_softc *sc = ifp->if_softc; + int i; ifp->if_timer = 0; - if (sc->sc_tx_timer > 0) { - if (--sc->sc_tx_timer == 0) { - printf("%s: device timeout\n", DEVNAME(sc)); - if (ifp->if_flags & IFF_DEBUG) { - iwx_nic_error(sc); - iwx_dump_driver_status(sc); + + /* + * We maintain a separate timer for each Tx queue because + * Tx aggregation queues can get "stuck" while other queues + * keep working. The Linux driver uses a similar workaround. + */ + for (i = 0; i < nitems(sc->sc_tx_timer); i++) { + if (sc->sc_tx_timer[i] > 0) { + if (--sc->sc_tx_timer[i] == 0) { + printf("%s: device timeout\n", DEVNAME(sc)); + if (ifp->if_flags & IFF_DEBUG) { + iwx_nic_error(sc); + iwx_dump_driver_status(sc); + } + if ((sc->sc_flags & IWX_FLAG_SHUTDOWN) == 0) + task_add(systq, &sc->init_task); + ifp->if_oerrors++; + return; } - if ((sc->sc_flags & IWX_FLAG_SHUTDOWN) == 0) - task_add(systq, &sc->init_task); - ifp->if_oerrors++; - return; + ifp->if_timer = 1; } - ifp->if_timer = 1; } ieee80211_watchdog(ifp); blob - 009848fc25471ad90717094d1f67d2f4d22a3d6a blob + bcedf33e0861fb1fcecb4c8af3e209837e03e535 --- sys/dev/pci/if_iwxvar.h +++ sys/dev/pci/if_iwxvar.h @@ -563,7 +563,7 @@ struct iwx_softc { struct iwx_nvm_data sc_nvm; struct iwx_bf_data sc_bf; - int sc_tx_timer; + int sc_tx_timer[IWX_LAST_AGG_TX_QUEUE]; int sc_rx_ba_sessions; int sc_scan_last_antenna;