Re: better run(4) fix (was: run(4): fix crash in run_task())

2015-07-15 Thread Stefan Sperling
On Tue, Jul 14, 2015 at 09:56:53PM -0600, Stefan Sperling wrote:
 This implements your suggestion to properly abort async tasks when
 bringing the interface down.
 
 Fixes the problem for me just as well.

And yet another diff after more discussion.

This diff makes the driver schedule a single 80211 state transition at a time.
The command queue is now used for commands to the device firmware only.

A task scheduled by the scan timeout will now be overwritten when run_stop()
schedules a transition to INIT. So there cannot be a SCAN-SCAN transition
that restarts the timeout while the interface goes down.

If this pattern works out we could move storage and management of this task
up into the net80211 layer eventyally, and make use of it in more drivers.

Index: if_run.c
===
RCS file: /cvs/src/sys/dev/usb/if_run.c,v
retrieving revision 1.109
diff -u -p -r1.109 if_run.c
--- if_run.c12 Jun 2015 15:47:31 -  1.109
+++ if_run.c15 Jul 2015 19:54:40 -
@@ -34,6 +34,7 @@
 #include sys/conf.h
 #include sys/device.h
 #include sys/endian.h
+#include sys/task.h
 
 #include machine/intr.h
 
@@ -361,7 +362,7 @@ voidrun_task(void *);
 void   run_do_async(struct run_softc *, void (*)(struct run_softc *,
void *), void *, int);
 intrun_newstate(struct ieee80211com *, enum ieee80211_state, int);
-void   run_newstate_cb(struct run_softc *, void *);
+void   run_newstate_task(void *);
 void   run_updateedca(struct ieee80211com *);
 void   run_updateedca_cb(struct run_softc *, void *);
 intrun_set_key(struct ieee80211com *, struct ieee80211_node *,
@@ -568,6 +569,7 @@ run_attach(struct device *parent, struct
}
 
usb_init_task(sc-sc_task, run_task, sc, USB_TASK_TYPE_GENERIC);
+   task_set(sc-sc_newstate_task, run_newstate_task, sc);
timeout_set(sc-scan_to, run_next_scan, sc);
timeout_set(sc-calib_to, run_calibrate_to, sc);
 
@@ -685,6 +687,8 @@ run_detach(struct device *self, int flag
 
s = splusb();
 
+   task_del(systq, sc-sc_newstate_task);
+
if (timeout_initialized(sc-scan_to))
timeout_del(sc-scan_to);
if (timeout_initialized(sc-calib_to))
@@ -1767,25 +1771,30 @@ int
 run_newstate(struct ieee80211com *ic, enum ieee80211_state nstate, int arg)
 {
struct run_softc *sc = ic-ic_softc;
-   struct run_cmd_newstate cmd;
+   struct run_newstate_task_arg *task_arg = sc-sc_newstate_task_arg;
+
+   /* remove any already scheduled transition */
+   task_del(systq, sc-sc_newstate_task);
+
+   /* schedule transition in a process context */
+   task_arg-state = nstate;
+   task_arg-arg = arg;
+   task_add(systq, sc-sc_newstate_task);
 
-   /* do it in a process context */
-   cmd.state = nstate;
-   cmd.arg = arg;
-   run_do_async(sc, run_newstate_cb, cmd, sizeof cmd);
return 0;
 }
 
 void
-run_newstate_cb(struct run_softc *sc, void *arg)
+run_newstate_task(void *arg)
 {
-   struct run_cmd_newstate *cmd = arg;
+   struct run_softc *sc = arg;
+   struct run_newstate_task_arg *task_arg = sc-sc_newstate_task_arg;
struct ieee80211com *ic = sc-sc_ic;
enum ieee80211_state ostate;
struct ieee80211_node *ni;
uint32_t tmp, sta[3];
uint8_t wcid;
-   int s;
+   int s, nstate = task_arg-state;
 
s = splnet();
ostate = ic-ic_state;
@@ -1795,7 +1804,7 @@ run_newstate_cb(struct run_softc *sc, vo
run_set_leds(sc, RT2860_LED_RADIO);
}
 
-   switch (cmd-state) {
+   switch (nstate) {
case IEEE80211_S_INIT:
if (ostate == IEEE80211_S_RUN) {
/* abort TSF synchronization */
@@ -1855,7 +1864,7 @@ run_newstate_cb(struct run_softc *sc, vo
 RT2860_LED_LINK_2GHZ : RT2860_LED_LINK_5GHZ));
break;
}
-   (void)sc-sc_newstate(ic, cmd-state, cmd-arg);
+   (void)sc-sc_newstate(ic, task_arg-state, task_arg-arg);
splx(s);
 }
 



Re: better run(4) fix (was: run(4): fix crash in run_task())

2015-07-14 Thread Stefan Sperling
On Sun, Jul 12, 2015 at 09:11:44PM +0200, Stefan Sperling wrote:
 On Sun, Jul 12, 2015 at 05:57:14PM +0200, Martin Pieuchot wrote:
run_newstate_cb SCAN - INIT
run_newstate_cb SCAN - SCAN
  
  How is this possible?  Why it isn't INIT - SCAN?
 
 I'm not entirely sure.

This implements your suggestion to properly abort async tasks when
bringing the interface down.

Fixes the problem for me just as well.

ok?

Index: if_run.c
===
RCS file: /cvs/src/sys/dev/usb/if_run.c,v
retrieving revision 1.109
diff -u -p -r1.109 if_run.c
--- if_run.c12 Jun 2015 15:47:31 -  1.109
+++ if_run.c15 Jul 2015 03:50:08 -
@@ -358,6 +358,7 @@ struct  ieee80211_node *run_node_alloc(s
 intrun_media_change(struct ifnet *);
 void   run_next_scan(void *);
 void   run_task(void *);
+void   run_cancel_async(struct run_softc *);
 void   run_do_async(struct run_softc *, void (*)(struct run_softc *,
void *), void *, int);
 intrun_newstate(struct ieee80211com *, enum ieee80211_state, int);
@@ -685,13 +686,10 @@ run_detach(struct device *self, int flag
 
s = splusb();
 
-   if (timeout_initialized(sc-scan_to))
-   timeout_del(sc-scan_to);
if (timeout_initialized(sc-calib_to))
timeout_del(sc-calib_to);
 
-   /* wait for all queued asynchronous commands to complete */
-   usb_rem_wait_task(sc-sc_udev, sc-sc_task);
+   run_cancel_async(sc);
 
usbd_ref_wait(sc-sc_udev);
 
@@ -1740,6 +1738,28 @@ run_task(void *arg)
 }
 
 void
+run_cancel_async(struct run_softc *sc)
+{
+   struct run_host_cmd_ring *ring = sc-cmdq;
+   struct run_host_cmd *cmd;
+   int s, i;
+
+   s = splusb();
+
+   if (timeout_initialized(sc-scan_to))
+   timeout_del(sc-scan_to);
+   usb_rem_task(sc-sc_udev, sc-sc_task);
+
+   for (i = 0; i  RUN_HOST_CMD_RING_COUNT; i++) {
+   cmd = ring-cmd[i];
+   cmd-cb = NULL;
+   }
+   ring-next = ring-cur = ring-queued = 0;
+
+   splx(s);
+}
+
+void
 run_do_async(struct run_softc *sc, void (*cb)(struct run_softc *, void *),
 void *arg, int len)
 {
@@ -1751,6 +1771,11 @@ run_do_async(struct run_softc *sc, void 
return;
 
s = splusb();
+   if (ring-queued == RUN_HOST_CMD_RING_COUNT) {
+   splx(s);
+   printf(%s: host cmd queue overrun\n, sc-sc_dev.dv_xname);
+   return; /* XXX */
+   }
cmd = ring-cmd[ring-cur];
cmd-cb = cb;
KASSERT(len = sizeof (cmd-data));
@@ -4504,7 +4529,9 @@ run_init(struct ifnet *ifp)
}
 
/* init host command ring */
-   sc-cmdq.cur = sc-cmdq.next = sc-cmdq.queued = 0;
+   if (sc-cmdq.queued != 0)
+   panic(outstanding host commands queued);
+   sc-cmdq.cur = sc-cmdq.next = 0;
 
/* init Tx rings (4 EDCAs) */
for (qid = 0; qid  4; qid++) {
@@ -4735,13 +4762,16 @@ run_stop(struct ifnet *ifp, int disable)
ifp-if_timer = 0;
ifp-if_flags = ~(IFF_RUNNING | IFF_OACTIVE);
 
-   timeout_del(sc-scan_to);
timeout_del(sc-calib_to);
 
s = splusb();
+
+   /* Cancel asynchronous state transitions etc. */
+   run_cancel_async(sc);
ieee80211_new_state(ic, IEEE80211_S_INIT, -1);
-   /* wait for all queued asynchronous commands to complete */
+   /* Wait for asynchronous state transition to INIT to complete. */
usb_wait_task(sc-sc_udev, sc-sc_task);
+
splx(s);
 
/* Disable Tx/Rx DMA. */



Re: better run(4) fix (was: run(4): fix crash in run_task())

2015-07-12 Thread Stefan Sperling
On Sun, Jul 12, 2015 at 05:57:14PM +0200, Martin Pieuchot wrote:
   run_newstate_cb SCAN - INIT
   run_newstate_cb SCAN - SCAN
 
 How is this possible?  Why it isn't INIT - SCAN?

I'm not entirely sure.

I think one possible scenario is as follows:

 We're scanning. scan_to runs (every 200ms) and appends a state
 transition (i.e. run_newstate_cb()) for SCAN-SCAN on the cmdq
 to scan the next channel.

 Some process causes a call to run_stop() followed by run_init().
 (Note that 'ifconfig run0 mediaopt monitor up' starts a scan and
 then does IFF_UP. Somehow run_media_change runs, which does run_stop()
 followed by run_init(), but I haven't yet tracked down why -- perhaps
 to switch bands from 2GHz to 5GHz. But this seems to be an important
 ingrediant since the crash can't be trigged with 2GHz-only devices.)

 run_stop() calls timeout_del() to cancel scan_to.

 run_stop() calls ieee80211_new_state(), i.e. run_newstate(), which appends
 a state transition to INIT on the cmdq.

 run_stop() waits for run_usb_task to complete.

 usb_task_thread gets scheduled.

 run_task() runs, and executes the pending SCAN-SCAN transition first.
 This transition calls timeout_add on scan_to, effectively cancelling
 run_stop's call to timeout_del().
 After that, the SCAN-INIT transition is executed as well.

 usb_task_thread goes back to sleep.

 Now scan_to runs and puts run_newstate_cb() with for state transition
 SCAN-SCAN on the cmdq.

 run_stop() finishes up and returns.

 run_init() is called. It resets cmdq.{next,cur,queued} to zero,
 corrupting the cmdq.

 run_task() runs to execute the pending SCAN-SCAN transition.
 But cmdq is now corrupt and we crash.

If this is indeed what's happening, my diff ensures that no pending
SCAN-SCAN transition is on the cmdq before appending the transition
to INIT. However, the last SCAN-SCAN transition would still call
timeout_add for scan_to after run_stop called timeout_del. So the diff
may not be a bullet proof fix. Perhaps we should add a timeout_del
to run_newstate_cb if going back to INIT?

What do you think? Does this make any sense?
Are there other possiblities?

Perhaps we should try to find a whiteboard in calgary and draw
some diagrams ;-)



Re: better run(4) fix (was: run(4): fix crash in run_task())

2015-07-12 Thread Martin Pieuchot
On 11/07/15(Sat) 15:16, Stefan Sperling wrote:
 On Sat, Jul 11, 2015 at 09:54:34AM +0200, Stefan Sperling wrote:
  While run_task() iterates through the host command queue during a scan,
  the interface may get reset via run_stop() and run_init() when switching
  bands (2GHz - 5GHz) in run_media_change().
  
  The list state gets corrupted because run_init() resets the list counters
  while run_task() is still iterating the list. run_task() will now attempt
  to call garbage or NULL callback pointers.
  
  run_newstate_cb SCAN - INIT
  run_newstate_cb SCAN - SCAN

How is this possible?  Why it isn't INIT - SCAN?

  run_newstate_cb SCAN - SCAN
  run_task ring-next=7
  run_task ring-cur=0
  run_task ring-queued=-7   -- negative nonesense
  run_task cmd=0x80651110
  run_task cmd-cb=0x0
  
  The kernel now explodes trying to call cmd-cb.
  
  This can be reproduced by running 'ifconfig run0 mediaopt monitor up'.
 
 My previous duff cured a symptom.
 I believe this new diff closes the actual race.

I'm really not found of either diffs

 State transitions can already be waiting to be scheduled while run_stop()
 is scheduling another transition trying to bring the device back to INIT.
 Each state transition happens asynchronously in a process context.
^^

I guess it happens only with USB drivers and the net80211 stack is not
suppose to work this way, right?

 So if multiple transitions are scheduled they can happen out of order.

I can think of two cases when such thing can happen: when a state
transition schedule another one or when a timeout fires.  Do you
know what happens in the run(4) case?

 When injecting a state transition from the side to force the device
 into a particular state, like run_stop() does, we need to make sure the
 net80211 stack is not concurrently trying to transition the driver for
 other purposes, such as the scanning loop.

What are the contexts of from the side and net80211 stack?

 This issue probably affects a number of other wifi drivers as well.
 If this fix is good to go I'll do a tree-wide sweep soon.

I'd like to understand what you're fix is fixing 8)  Are we chasing a
problem of ic_state not being synchronized with the reality because
some code in interrupt context scheduled a state transition or are we
forcing multiple state transitions to not interleave an INIT change?

In any case, if we want to go MP, we should have a better understanding
of these state transitions :)

 Please disregard all the other hackish patches I mailed out this morning.
 
 Index: if_run.c
 ===
 RCS file: /cvs/src/sys/dev/usb/if_run.c,v
 retrieving revision 1.109
 diff -u -p -r1.109 if_run.c
 --- if_run.c  12 Jun 2015 15:47:31 -  1.109
 +++ if_run.c  11 Jul 2015 13:00:21 -
 @@ -1751,6 +1751,11 @@ run_do_async(struct run_softc *sc, void 
   return;
  
   s = splusb();
 + if (ring-queued == RUN_HOST_CMD_RING_COUNT) {
 + splx(s);
 + printf(%s: host cmd queue overrun\n, sc-sc_dev.dv_xname);
 + return; /* XXX */
 + }
   cmd = ring-cmd[ring-cur];
   cmd-cb = cb;
   KASSERT(len = sizeof (cmd-data));
 @@ -4504,7 +4509,9 @@ run_init(struct ifnet *ifp)
   }
  
   /* init host command ring */
 - sc-cmdq.cur = sc-cmdq.next = sc-cmdq.queued = 0;
 + if (sc-cmdq.queued != 0)
 + panic(outstanding host commands queued);
 + sc-cmdq.cur = sc-cmdq.next = 0;
  
   /* init Tx rings (4 EDCAs) */
   for (qid = 0; qid  4; qid++) {
 @@ -4739,9 +4746,17 @@ run_stop(struct ifnet *ifp, int disable)
   timeout_del(sc-calib_to);
  
   s = splusb();
 +
 + /*
 +  * Wait for all queued asynchronous commands to complete
 +  * before switching to INIT state.
 +  */
 + usb_wait_task(sc-sc_udev, sc-sc_task);
 +
   ieee80211_new_state(ic, IEEE80211_S_INIT, -1);
 - /* wait for all queued asynchronous commands to complete */
 + /* Wait for asynchronous state transition to complete. */
   usb_wait_task(sc-sc_udev, sc-sc_task);
 +
   splx(s);
  
   /* Disable Tx/Rx DMA. */
 



better run(4) fix (was: run(4): fix crash in run_task())

2015-07-11 Thread Stefan Sperling
On Sat, Jul 11, 2015 at 09:54:34AM +0200, Stefan Sperling wrote:
 While run_task() iterates through the host command queue during a scan,
 the interface may get reset via run_stop() and run_init() when switching
 bands (2GHz - 5GHz) in run_media_change().
 
 The list state gets corrupted because run_init() resets the list counters
 while run_task() is still iterating the list. run_task() will now attempt
 to call garbage or NULL callback pointers.
 
 run_newstate_cb SCAN - INIT
 run_newstate_cb SCAN - SCAN
 run_newstate_cb SCAN - SCAN
 run_task ring-next=7
 run_task ring-cur=0
 run_task ring-queued=-7   -- negative nonesense
 run_task cmd=0x80651110
 run_task cmd-cb=0x0
 
 The kernel now explodes trying to call cmd-cb.
 
 This can be reproduced by running 'ifconfig run0 mediaopt monitor up'.

My previous duff cured a symptom.
I believe this new diff closes the actual race.

State transitions can already be waiting to be scheduled while run_stop()
is scheduling another transition trying to bring the device back to INIT.
Each state transition happens asynchronously in a process context.
So if multiple transitions are scheduled they can happen out of order.

When injecting a state transition from the side to force the device
into a particular state, like run_stop() does, we need to make sure the
net80211 stack is not concurrently trying to transition the driver for
other purposes, such as the scanning loop.

This issue probably affects a number of other wifi drivers as well.
If this fix is good to go I'll do a tree-wide sweep soon.

Please disregard all the other hackish patches I mailed out this morning.

Index: if_run.c
===
RCS file: /cvs/src/sys/dev/usb/if_run.c,v
retrieving revision 1.109
diff -u -p -r1.109 if_run.c
--- if_run.c12 Jun 2015 15:47:31 -  1.109
+++ if_run.c11 Jul 2015 13:00:21 -
@@ -1751,6 +1751,11 @@ run_do_async(struct run_softc *sc, void 
return;
 
s = splusb();
+   if (ring-queued == RUN_HOST_CMD_RING_COUNT) {
+   splx(s);
+   printf(%s: host cmd queue overrun\n, sc-sc_dev.dv_xname);
+   return; /* XXX */
+   }
cmd = ring-cmd[ring-cur];
cmd-cb = cb;
KASSERT(len = sizeof (cmd-data));
@@ -4504,7 +4509,9 @@ run_init(struct ifnet *ifp)
}
 
/* init host command ring */
-   sc-cmdq.cur = sc-cmdq.next = sc-cmdq.queued = 0;
+   if (sc-cmdq.queued != 0)
+   panic(outstanding host commands queued);
+   sc-cmdq.cur = sc-cmdq.next = 0;
 
/* init Tx rings (4 EDCAs) */
for (qid = 0; qid  4; qid++) {
@@ -4739,9 +4746,17 @@ run_stop(struct ifnet *ifp, int disable)
timeout_del(sc-calib_to);
 
s = splusb();
+
+   /*
+* Wait for all queued asynchronous commands to complete
+* before switching to INIT state.
+*/
+   usb_wait_task(sc-sc_udev, sc-sc_task);
+
ieee80211_new_state(ic, IEEE80211_S_INIT, -1);
-   /* wait for all queued asynchronous commands to complete */
+   /* Wait for asynchronous state transition to complete. */
usb_wait_task(sc-sc_udev, sc-sc_task);
+
splx(s);
 
/* Disable Tx/Rx DMA. */