While working on 40MHz support for iwm(4) I noticed that throughput
dropped from 200 Mbit/s to about 2 Mbit/s whenever the laptop roamed
between APs. Sometimes traffic even stalled completely.
The patch below fixes this problem for me.
I used an 8265 device to reproduce the problem and test my fix.
If you are able to test roaming between APs (with the same SSID on
all APs, and ideally the same layer2 network), please test this patch
on any of the following iwm(4) devices: 7260, 7265, 8260, 9260, 9560
Should anyone need additional motivation to help, I will send you my
40MHz support patch as soon as I see your test results ;-)
To establish a baseline, please test without the patch first and then
with the patch applied.
Use wifi with 'ifconfig iwm0 debug' enabled to make roaming attempts
appear in /var/log/messages, and watch this file with a command such as:
tail -f /var/log/messages
Move towards another access point and trigger a background scan by
running this command as root:
ifconfig iwm0 scan
It may take a few scan attempts until roaming triggers.
Successful roaming displays the following in /var/log/messages:
iwm0: roaming from 00:2b:a2:95:e3:e4 chan 1 to 00:2b:a2:95:e3:f4 chan 36
iwm0: RUN -> AUTH
iwm0: sending auth to 00:1a:dd:da:e3:f4 on channel 36 mode 11a
iwm0: AUTH -> ASSOC
iwm0: sending assoc_req to 00:2b:a2:95:e3:f4 on channel 36 mode 11a
iwm0: ASSOC -> RUN
If it doesn't do this but enters INIT or SCAN or whatever, roaming failed.
Note again that all APs involved need to use the same SSID for this to work
as intended.
After each AP switch, measure Tx throughput (i.e. traffic sent from the
laptop, not towards the laptop).
The choice of measurement tool doesn't really matter. For example,
tcpbench works; use it towards another computer which runs tcpbench -s.
Tx throughput should remain within expectations; after roaming, throughput
towards a given AP should match what you see after ifconfig iwm0 down/up
for the same AP. You can fix the channel or BSSID if you want to force a
particular access point for reference measurements:
ifconfig iwm0 chan 1
Clear with: ifconfig iwm0 -chan
ifconfig iwm0 bssid AP_MAC_ADDRESS (this disables roaming!)
Clear with: ifconfig iwm0 -bssid (re-enables roaming)
The problem was introduced when Tx aggregation support was added.
Firmware expectations about Tx queue handling were always a bit unclear
to me. At the time it seemed that the only way to avoid fatal firmware
errors was to leave Tx agg queues enabled until the device is reset.
Unfortunately, I missed that Tx agg queues end up with low throughput when
we switch to a new access point without going through a full device reset.
Just removing the AP from the firmware station table is not enough.
After some trial and error I figured out that when switching away
from our old access point, the following needs to happen, in order,
to have Tx aggregation queues work properly against the new AP:
- Explicitly stop Rx/Tx aggregation sessions.
- Now remove the firmware station that represents the AP.
- Now the Tx agg queue is marked "unused" in firmware and we are
allowed to disable it without getting a fatal firmware error.
Makes sense in hindsight...
diff refs/heads/master refs/heads/txagg-fix
blob - 8bf1224f792b4b4f1e1214660de9a211e3788d74
blob + 35c4b352f905c1e493d64ccd360e022a77111e1e
--- sys/dev/pci/if_iwm.c
+++ sys/dev/pci/if_iwm.c
@@ -307,6 +307,7 @@ int iwm_nic_init(struct iwm_softc *);
int iwm_enable_ac_txq(struct iwm_softc *, int, int);
int iwm_enable_txq(struct iwm_softc *, int, int, int, int, uint8_t,
uint16_t);
+int iwm_disable_txq(struct iwm_softc *, int, int, uint8_t);
int iwm_post_alive(struct iwm_softc *);
struct iwm_phy_db_entry *iwm_phy_db_get_section(struct iwm_softc *, uint16_t,
uint16_t);
@@ -2462,6 +2463,26 @@ iwm_enable_txq(struct iwm_softc *sc, int sta_id, int q
}
int
+iwm_disable_txq(struct iwm_softc *sc, int sta_id, int qid, uint8_t tid)
+{
+ struct iwm_scd_txq_cfg_cmd cmd;
+ int err;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.tid = tid;
+ cmd.scd_queue = qid;
+ cmd.enable = 0;
+ cmd.sta_id = sta_id;
+
+ err = iwm_send_cmd_pdu(sc, IWM_SCD_QUEUE_CFG, 0, sizeof(cmd), &cmd);
+ if (err)
+ return err;
+
+ sc->qenablemsk &= ~(1 << qid);
+ return 0;
+}
+
+int
iwm_post_alive(struct iwm_softc *sc)
{
int nwords;
@@ -3397,7 +3418,6 @@ iwm_sta_tx_agg(struct iwm_softc *sc, struct ieee80211_
struct iwm_node *in = (void *)ni;
int qid = IWM_FIRST_AGG_TX_QUEUE + tid;
struct iwm_tx_ring *ring;
- struct ieee80211_tx_ba *ba;
enum ieee80211_edca_ac ac;
int fifo;
uint32_t status;
@@ -3417,7 +3437,6 @@ iwm_sta_tx_agg(struct iwm_softc *sc, struct ieee80211_
}
ring = &sc->txq[qid];
- ba = &ni->ni_tx_ba[tid];
ac = iwm_tid_to_ac[tid];
fifo = iwm_ac_to_tx_fifo[ac];
@@ -3434,7 +3453,10 @@ iwm_sta_tx_agg(struct iwm_softc *sc, struct ieee80211_
in->tfd_queue_msk |= (1 << qid);
} else {
in->tid_disable_ampdu |= (1 << tid);
- /* Queue remains enabled in the TFD queue mask. */
+ /*
+ * Queue remains enabled in the TFD queue mask
+ * until we leave RUN state.
+ */
err = iwm_flush_sta(sc, in);
if (err)
return err;
@@ -3503,8 +3525,7 @@ iwm_sta_tx_agg(struct iwm_softc *sc, struct ieee80211_
* Clear pending frames but keep the queue enabled.
* Firmware panics if we disable the queue here.
*/
- iwm_txq_advance(sc, ring,
- IWM_AGG_SSN_TO_TXQ_IDX(ba->ba_winend));
+ iwm_txq_advance(sc, ring, ring->cur);
iwm_clear_oactive(sc, ring);
}
@@ -3520,6 +3541,13 @@ iwm_ba_task(void *arg)
int s = splnet();
int tid, err = 0;
+ if ((sc->sc_flags & IWM_FLAG_SHUTDOWN) ||
+ ic->ic_state != IEEE80211_S_RUN) {
+ refcnt_rele_wake(&sc->task_refs);
+ splx(s);
+ return;
+ }
+
for (tid = 0; tid < IWM_MAX_TID_COUNT && !err; tid++) {
if (sc->sc_flags & IWM_FLAG_SHUTDOWN)
break;
@@ -8502,10 +8530,45 @@ iwm_run_stop(struct iwm_softc *sc)
{
struct ieee80211com *ic = &sc->sc_ic;
struct iwm_node *in = (void *)ic->ic_bss;
- int err;
+ struct ieee80211_node *ni = &in->in_ni;
+ int err, i, tid;
splassert(IPL_NET);
+ /*
+ * Stop Tx/Rx BA sessions now. We cannot rely on the BA task
+ * for this when moving out of RUN state since it runs in a
+ * separate thread.
+ * Note that in->in_ni (struct ieee80211_node) already represents
+ * our new access point in case we are roaming between APs.
+ * This means we cannot rely on struct ieee802111_node to tell
+ * us which BA sessions exist.
+ */
+ for (i = 0; i < nitems(sc->sc_rxba_data); i++) {
+ struct iwm_rxba_data *rxba = &sc->sc_rxba_data[i];
+ if (rxba->baid == IWM_RX_REORDER_DATA_INVALID_BAID)
+ continue;
+ err = iwm_sta_rx_agg(sc, ni, rxba->tid, 0, 0, 0, 0);
+ if (err)
+ return err;
+ iwm_clear_reorder_buffer(sc, rxba);
+ if (sc->sc_rx_ba_sessions > 0)
+ sc->sc_rx_ba_sessions--;
+ }
+ for (tid = 0; tid < IWM_MAX_TID_COUNT; tid++) {
+ int qid = IWM_FIRST_AGG_TX_QUEUE + tid;
+ if ((sc->tx_ba_queue_mask & (1 << qid)) == 0)
+ continue;
+ err = iwm_sta_tx_agg(sc, ni, tid, 0, 0, 0);
+ if (err)
+ return err;
+ err = iwm_disable_txq(sc, IWM_STATION_ID, qid, tid);
+ if (err)
+ return err;
+ in->tfd_queue_msk &= ~(1 << qid);
+ }
+ ieee80211_ba_del(ni);
+
if (ic->ic_opmode == IEEE80211_M_MONITOR)
iwm_led_blink_stop(sc);
@@ -8940,7 +9003,6 @@ iwm_newstate(struct ieee80211com *ic, enum ieee80211_s
{
struct ifnet *ifp = IC2IFP(ic);
struct iwm_softc *sc = ifp->if_softc;
- int i;
/*
* Prevent attemps to transition towards the same state, unless
@@ -8954,10 +9016,6 @@ iwm_newstate(struct ieee80211com *ic, enum ieee80211_s
timeout_del(&sc->sc_calib_to);
iwm_del_task(sc, systq, &sc->ba_task);
iwm_del_task(sc, systq, &sc->mac_ctxt_task);
- for (i = 0; i < nitems(sc->sc_rxba_data); i++) {
- struct iwm_rxba_data *rxba = &sc->sc_rxba_data[i];
- iwm_clear_reorder_buffer(sc, rxba);
- }
}
sc->ns_nstate = nstate;