Re: hung task in mac80211

2017-09-06 Thread Sebastian Gottschall

i confirm this patch fixes the issue for me as well

Am 06.09.2017 um 17:04 schrieb Matteo Croce:

On Wed, Sep 6, 2017 at 2:58 PM, Johannes Berg  wrote:

On Wed, 2017-09-06 at 13:57 +0200, Matteo Croce wrote:


I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
The problem is present both on my AP and on my notebook,
so it seems it affects AP and STA mode as well.
The generated messages are:

INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
   Not tainted 4.13.0 #57
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
message.
kworker/u16:6   D0   120  2 0x
Workqueue: phy0 ieee80211_ba_session_work [mac80211]
Call Trace:
  ? __schedule+0x174/0x5b0
  ? schedule+0x31/0x80
  ? schedule_preempt_disabled+0x9/0x10
  ? __mutex_lock.isra.2+0x163/0x480
  ? select_task_rq_fair+0xb9f/0xc60
  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]

Yeah - obviously as Stefano found, both take >ampdu_mlme.mtx.

Can you try this?

diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 2b36eff5d97e..d8d32776031e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct 
ieee80211_sub_if_data *sdata, u8 *d
 ieee80211_tx_skb(sdata, skb);
  }

-void __ieee80211_start_rx_ba_session(struct sta_info *sta,
-u8 dialog_token, u16 timeout,
-u16 start_seq_num, u16 ba_policy, u16 tid,
-u16 buf_size, bool tx, bool auto_seq)
+void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
+ u8 dialog_token, u16 timeout,
+ u16 start_seq_num, u16 ba_policy, u16 tid,
+ u16 buf_size, bool tx, bool auto_seq)
  {
 struct ieee80211_local *local = sta->sdata->local;
 struct tid_ampdu_rx *tid_agg_rx;
@@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 ht_dbg(sta->sdata,
"STA %pM requests BA session on unsupported tid %d\n",
sta->sta.addr, tid);
-   goto end_no_lock;
+   goto end;
 }

 if (!sta->sta.ht_cap.ht_supported) {
@@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
"STA %pM erroneously requests BA session on tid %d w/o 
QoS\n",
sta->sta.addr, tid);
 /* send a response anyway, it's an error case if we get here */
-   goto end_no_lock;
+   goto end;
 }

 if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) {
 ht_dbg(sta->sdata,
"Suspend in progress - Denying ADDBA request (%pM tid 
%d)\n",
sta->sta.addr, tid);
-   goto end_no_lock;
+   goto end;
 }

 /* sanity check for incoming parameters:
@@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 ht_dbg_ratelimited(sta->sdata,
"AddBA Req with bad params from %pM on tid %u. 
policy %d, buffer size %d\n",
sta->sta.addr, tid, ba_policy, buf_size);
-   goto end_no_lock;
+   goto end;
 }
 /* determine default buffer size */
 if (buf_size == 0)
@@ -311,7 +311,6 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
buf_size, sta->sta.addr);

 /* examine state machine */
-   mutex_lock(>ampdu_mlme.mtx);

 if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
 if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
@@ -415,15 +414,25 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 __clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
 sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
 }
-   mutex_unlock(>ampdu_mlme.mtx);

-end_no_lock:
 if (tx)
 ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
   dialog_token, status, 1, buf_size,
   timeout);
  }

+void __ieee80211_start_rx_ba_session(struct sta_info *sta,
+u8 dialog_token, u16 timeout,
+u16 start_seq_num, u16 ba_policy, u16 tid,
+u16 buf_size, bool tx, bool auto_seq)
+{
+   mutex_lock(>ampdu_mlme.mtx);
+   ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
+start_seq_num, ba_policy, tid,
+buf_size, tx, auto_seq);
+   

Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
On Wed, 2017-09-06 at 17:04 +0200, Matteo Croce wrote:
> 
> I confirm that this patch fixes the hang too.

Cool, I'll go apply it.

> I'm curious to see if there are noticeable performance differences
> between the two solutions.

Nope, you hit this code path essentially once.

johannes


Re: hung task in mac80211

2017-09-06 Thread Matteo Croce
On Wed, Sep 6, 2017 at 2:58 PM, Johannes Berg  wrote:
> On Wed, 2017-09-06 at 13:57 +0200, Matteo Croce wrote:
>
>> I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
>> The problem is present both on my AP and on my notebook,
>> so it seems it affects AP and STA mode as well.
>> The generated messages are:
>>
>> INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
>>   Not tainted 4.13.0 #57
>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>> message.
>> kworker/u16:6   D0   120  2 0x
>> Workqueue: phy0 ieee80211_ba_session_work [mac80211]
>> Call Trace:
>>  ? __schedule+0x174/0x5b0
>>  ? schedule+0x31/0x80
>>  ? schedule_preempt_disabled+0x9/0x10
>>  ? __mutex_lock.isra.2+0x163/0x480
>>  ? select_task_rq_fair+0xb9f/0xc60
>>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>
> Yeah - obviously as Stefano found, both take >ampdu_mlme.mtx.
>
> Can you try this?
>
> diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
> index 2b36eff5d97e..d8d32776031e 100644
> --- a/net/mac80211/agg-rx.c
> +++ b/net/mac80211/agg-rx.c
> @@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct 
> ieee80211_sub_if_data *sdata, u8 *d
> ieee80211_tx_skb(sdata, skb);
>  }
>
> -void __ieee80211_start_rx_ba_session(struct sta_info *sta,
> -u8 dialog_token, u16 timeout,
> -u16 start_seq_num, u16 ba_policy, u16 
> tid,
> -u16 buf_size, bool tx, bool auto_seq)
> +void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
> + u8 dialog_token, u16 timeout,
> + u16 start_seq_num, u16 ba_policy, u16 
> tid,
> + u16 buf_size, bool tx, bool auto_seq)
>  {
> struct ieee80211_local *local = sta->sdata->local;
> struct tid_ampdu_rx *tid_agg_rx;
> @@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
> ht_dbg(sta->sdata,
>"STA %pM requests BA session on unsupported tid %d\n",
>sta->sta.addr, tid);
> -   goto end_no_lock;
> +   goto end;
> }
>
> if (!sta->sta.ht_cap.ht_supported) {
> @@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info 
> *sta,
>"STA %pM erroneously requests BA session on tid %d w/o 
> QoS\n",
>sta->sta.addr, tid);
> /* send a response anyway, it's an error case if we get here 
> */
> -   goto end_no_lock;
> +   goto end;
> }
>
> if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) {
> ht_dbg(sta->sdata,
>"Suspend in progress - Denying ADDBA request (%pM tid 
> %d)\n",
>sta->sta.addr, tid);
> -   goto end_no_lock;
> +   goto end;
> }
>
> /* sanity check for incoming parameters:
> @@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
> ht_dbg_ratelimited(sta->sdata,
>"AddBA Req with bad params from %pM on tid 
> %u. policy %d, buffer size %d\n",
>sta->sta.addr, tid, ba_policy, buf_size);
> -   goto end_no_lock;
> +   goto end;
> }
> /* determine default buffer size */
> if (buf_size == 0)
> @@ -311,7 +311,6 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
>buf_size, sta->sta.addr);
>
> /* examine state machine */
> -   mutex_lock(>ampdu_mlme.mtx);
>
> if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
> if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
> @@ -415,15 +414,25 @@ void __ieee80211_start_rx_ba_session(struct sta_info 
> *sta,
> __clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
> sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
> }
> -   mutex_unlock(>ampdu_mlme.mtx);
>
> -end_no_lock:
> if (tx)
> ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
>   dialog_token, status, 1, buf_size,
>   timeout);
>  }
>
> +void __ieee80211_start_rx_ba_session(struct sta_info *sta,
> +u8 dialog_token, u16 timeout,
> +u16 start_seq_num, u16 ba_policy, u16 
> tid,
> +u16 buf_size, bool tx, bool auto_seq)
> +{
> +   mutex_lock(>ampdu_mlme.mtx);
> +   ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
> +start_seq_num, ba_policy, 

Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
On Wed, 2017-09-06 at 16:27 +0200, Stefano Brivio wrote:
> 
> Sorry for the extended bothering :) but here, you're extending quite
> a bit the scope of the lock also
> when__ieee80211_start_rx_ba_session() is called by
> ieee80211_process_addba_request().

I know, but it doesn't matter.

> No idea what the hit can be, but we can't safely assume it's

> nothing either.   

We don't really have to assume anything, we can read the code :-)

Trust me, I probably wrote most of it. It's fine, just sanity checks.

> What about simply introducing a 'ampdu_mlme_lock_held' argument
> instead?

Eww, no.

johannes



Re: hung task in mac80211

2017-09-06 Thread Stefano Brivio
On Wed, 06 Sep 2017 14:58:48 +0200
Johannes Berg  wrote:

> +void __ieee80211_start_rx_ba_session(struct sta_info *sta,
> +  u8 dialog_token, u16 timeout,
> +  u16 start_seq_num, u16 ba_policy, u16 tid,
> +  u16 buf_size, bool tx, bool auto_seq)
> +{
> + mutex_lock(>ampdu_mlme.mtx);
> + ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
> +  start_seq_num, ba_policy, tid,
> +  buf_size, tx, auto_seq);
> + mutex_unlock(>ampdu_mlme.mtx);
> +}
> +

Sorry for the extended bothering :) but here, you're extending quite a
bit the scope of the lock also when__ieee80211_start_rx_ba_session() is
called by ieee80211_process_addba_request(). No idea what the hit can
be, but we can't safely assume it's nothing either. What about simply
introducing a 'ampdu_mlme_lock_held' argument instead? Something like:

diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 2b36eff5d97e..35a9eff1ec66 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -248,7 +248,8 @@ static void ieee80211_send_addba_resp(struct 
ieee80211_sub_if_data *sdata, u8 *d
 void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 u8 dialog_token, u16 timeout,
 u16 start_seq_num, u16 ba_policy, u16 tid,
-u16 buf_size, bool tx, bool auto_seq)
+u16 buf_size, bool tx, bool auto_seq,
+bool ampdu_mlme_lock_held)
 {
struct ieee80211_local *local = sta->sdata->local;
struct tid_ampdu_rx *tid_agg_rx;
@@ -311,7 +312,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
   buf_size, sta->sta.addr);
 
/* examine state machine */
-   mutex_lock(>ampdu_mlme.mtx);
+   if (!ampdu_mlme_lock_held)
+   mutex_lock(>ampdu_mlme.mtx);
 
if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
@@ -415,7 +417,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
__clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
}
-   mutex_unlock(>ampdu_mlme.mtx);
+   if (!ampdu_mlme_lock_held)
+   mutex_unlock(>ampdu_mlme.mtx);
 
 end_no_lock:
if (tx)
@@ -445,7 +448,7 @@ void ieee80211_process_addba_request(struct ieee80211_local 
*local,
 
__ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
start_seq_num, ba_policy, tid,
-   buf_size, true, false);
+   buf_size, true, false, false);
 }
 
 void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif,
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c92df492e898..59ba67e8942f 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -335,7 +335,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
   sta->ampdu_mlme.tid_rx_manage_offl))
__ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
IEEE80211_MAX_AMPDU_BUF,
-   false, true);
+   false, true, true);
 
if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
   sta->ampdu_mlme.tid_rx_manage_offl))
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2197c62a0a6e..5d494ac65853 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1759,7 +1759,8 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, 
u16 tid,
 void __ieee80211_start_rx_ba_session(struct sta_info *sta,
 u8 dialog_token, u16 timeout,
 u16 start_seq_num, u16 ba_policy, u16 tid,
-u16 buf_size, bool tx, bool auto_seq);
+u16 buf_size, bool tx, bool auto_seq,
+bool ampdu_mlme_lock_held);
 void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
 enum ieee80211_agg_stop_reason reason);
 void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,

-- 
Stefano


Re: hung task in mac80211

2017-09-06 Thread Stefano Brivio
On Wed, 06 Sep 2017 15:30:10 +0200
Johannes Berg  wrote:

> So for example replacing the loop of tid = 0..NUM_TIDS-1 with a
> list_for_each_entry() would already be unsafe with the dropping if the
> list were to require the mutex for locking.

Sure. Still, it would need another code change to break, but in general
I do agree indeed. :)


--
Stefano


Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
On Wed, 2017-09-06 at 15:27 +0200, Stefano Brivio wrote:
> 
> Yes, that was based on the assumption that the initial part of
> __ieee80211_start_rx_ba_session() can't really affect the AMPDU
> state-machine in any way.

That's not really the point, if that changes that function would have
to move the locking around, and nothing else.

The point is more that code in ieee80211_ba_session_work() could assume
the lock is held across the entire loop, since that's the way it's
written and looks like even with your patch.

So for example replacing the loop of tid = 0..NUM_TIDS-1 with a
list_for_each_entry() would already be unsafe with the dropping if the
list were to require the mutex for locking.

johannes


Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
On Wed, 2017-09-06 at 15:19 +0200, Stefano Brivio wrote:
> On Wed, 06 Sep 2017 14:48:35 +0200
> Johannes Berg  wrote:
> 
> > I'll look in a bit - but
> > 
> > > + mutex_unlock(>ampdu_mlme.mtx);
> > >   ___ieee80211_stop_rx_ba_session(
> > >   sta, tid, WLAN_BACK_RECIPIENT,
> > >   WLAN_REASON_QSTA_TIMEOUT,
> > > true);  
> > 
> > This already has three underscores so shouldn't drop.
> 
> Right, of course.
> 
> > [...]
> > > + mutex_unlock(>ampdu_mlme.mtx);
> > >   __ieee80211_start_rx_ba_session(sta, 0,
> > > 0,
> > > 0, 1, tid,  
> > 
> > maybe this one needs a ___ version then?
> 
> Either that, or as it's a single call, perhaps just the following?
> Matter of taste I guess...

I don't think it's a matter of taste - for me, in principle, dropping
locks for small sections of code where the larger section holds it is a
bug waiting to happen. It may (may, I don't even know) be OK here, but
in general it's something to avoid.

johannes


Re: hung task in mac80211

2017-09-06 Thread Stefano Brivio
On Wed, 06 Sep 2017 15:21:00 +0200
Johannes Berg  wrote:

> On Wed, 2017-09-06 at 15:19 +0200, Stefano Brivio wrote:
> > On Wed, 06 Sep 2017 14:48:35 +0200
> > Johannes Berg  wrote:
> >   
> > > I'll look in a bit - but
> > >   
> > > > +   mutex_unlock(>ampdu_mlme.mtx);
> > > >     ___ieee80211_stop_rx_ba_session(
> > > >     sta, tid, WLAN_BACK_RECIPIENT,
> > > >     WLAN_REASON_QSTA_TIMEOUT,
> > > > true);    
> > > 
> > > This already has three underscores so shouldn't drop.  
> > 
> > Right, of course.
> >   
> > > [...]  
> > > > +   mutex_unlock(>ampdu_mlme.mtx);
> > > >     __ieee80211_start_rx_ba_session(sta, 0,
> > > > 0,
> > > > 0, 1, tid,    
> > > 
> > > maybe this one needs a ___ version then?  
> > 
> > Either that, or as it's a single call, perhaps just the following?
> > Matter of taste I guess...  
> 
> I don't think it's a matter of taste - for me, in principle, dropping
> locks for small sections of code where the larger section holds it is a
> bug waiting to happen. It may (may, I don't even know) be OK here, but
> in general it's something to avoid.

Yes, that was based on the assumption that the initial part of
__ieee80211_start_rx_ba_session() can't really affect the AMPDU
state-machine in any way.

But sure, one small change there in the future and the assumption
doesn't hold anymore.


--
Stefano


Re: hung task in mac80211

2017-09-06 Thread Sebastian Gottschall

Am 06.09.2017 um 15:03 schrieb Johannes Berg:

On Wed, 2017-09-06 at 14:48 +0200, Johannes Berg wrote:

I'm surprised nobody saw this before - though perhaps Sebastian's
useless report is the same.

Oh, that's because this is only for the offloaded manager thing, and
that's only ath10k.

johannes

that explans the behaviour i have found with latest mac80211 and ath10k.


--
Mit freundlichen Grüssen / Regards

Sebastian Gottschall / CTO

NewMedia-NET GmbH - DD-WRT
Firmensitz:  Berliner Ring 101, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottsch...@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565



Re: hung task in mac80211

2017-09-06 Thread Stefano Brivio
On Wed, 06 Sep 2017 14:48:35 +0200
Johannes Berg  wrote:

> I'll look in a bit - but
> 
> > +   mutex_unlock(>ampdu_mlme.mtx);
> >     ___ieee80211_stop_rx_ba_session(
> >     sta, tid, WLAN_BACK_RECIPIENT,
> >     WLAN_REASON_QSTA_TIMEOUT, true);  
> 
> This already has three underscores so shouldn't drop.

Right, of course.

> [...]
> > +   mutex_unlock(>ampdu_mlme.mtx);
> >     __ieee80211_start_rx_ba_session(sta, 0, 0,
> > 0, 1, tid,  
> 
> maybe this one needs a ___ version then?

Either that, or as it's a single call, perhaps just the following?
Matter of taste I guess...

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c92df492e898..377dd3c233d3 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -332,10 +332,13 @@ void ieee80211_ba_session_work(struct work_struct *work)
WLAN_REASON_UNSPECIFIED, true);
 
if (test_and_clear_bit(tid,
-  sta->ampdu_mlme.tid_rx_manage_offl))
+  sta->ampdu_mlme.tid_rx_manage_offl)) {
+   mutex_unlock(>ampdu_mlme.mtx);
__ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
IEEE80211_MAX_AMPDU_BUF,
false, true);
+   mutex_lock(>ampdu_mlme.mtx);
+   }
 
if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
   sta->ampdu_mlme.tid_rx_manage_offl))
-- 
Stefano


Re: hung task in mac80211

2017-09-06 Thread Matteo Croce
On Wed, Sep 6, 2017 at 2:40 PM, Stefano Brivio  wrote:
> On Wed, 6 Sep 2017 13:57:47 +0200
> Matteo Croce  wrote:
>
>> Hi,
>>
>> I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
>> The problem is present both on my AP and on my notebook,
>> so it seems it affects AP and STA mode as well.
>> The generated messages are:
>>
>> INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
>>   Not tainted 4.13.0 #57
>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> kworker/u16:6   D0   120  2 0x
>> Workqueue: phy0 ieee80211_ba_session_work [mac80211]
>> Call Trace:
>>  ? __schedule+0x174/0x5b0
>>  ? schedule+0x31/0x80
>>  ? schedule_preempt_disabled+0x9/0x10
>>  ? __mutex_lock.isra.2+0x163/0x480
>>  ? select_task_rq_fair+0xb9f/0xc60
>>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>
> This is ugly and maybe wrong, but you could check perhaps...:
>
> diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
> index c92df492e898..bd7512a656f2 100644
> --- a/net/mac80211/ht.c
> +++ b/net/mac80211/ht.c
> @@ -320,28 +320,40 @@ void ieee80211_ba_session_work(struct work_struct *work)
>
> mutex_lock(>ampdu_mlme.mtx);
> for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
> -   if (test_and_clear_bit(tid, 
> sta->ampdu_mlme.tid_rx_timer_expired))
> +   if (test_and_clear_bit(tid, 
> sta->ampdu_mlme.tid_rx_timer_expired)) {
> +   mutex_unlock(>ampdu_mlme.mtx);
> ___ieee80211_stop_rx_ba_session(
> sta, tid, WLAN_BACK_RECIPIENT,
> WLAN_REASON_QSTA_TIMEOUT, true);
> +   mutex_lock(>ampdu_mlme.mtx);
> +   }
>
> if (test_and_clear_bit(tid,
> -  sta->ampdu_mlme.tid_rx_stop_requested))
> +  
> sta->ampdu_mlme.tid_rx_stop_requested)) {
> +   mutex_unlock(>ampdu_mlme.mtx);
> ___ieee80211_stop_rx_ba_session(
> sta, tid, WLAN_BACK_RECIPIENT,
> WLAN_REASON_UNSPECIFIED, true);
> +   mutex_lock(>ampdu_mlme.mtx);
> +   }
>
> if (test_and_clear_bit(tid,
> -  sta->ampdu_mlme.tid_rx_manage_offl))
> +  sta->ampdu_mlme.tid_rx_manage_offl)) {
> +   mutex_unlock(>ampdu_mlme.mtx);
> __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
> 
> IEEE80211_MAX_AMPDU_BUF,
> false, true);
> +   mutex_lock(>ampdu_mlme.mtx);
> +   }
>
> if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
> -  sta->ampdu_mlme.tid_rx_manage_offl))
> +  sta->ampdu_mlme.tid_rx_manage_offl)) {
> +   mutex_unlock(>ampdu_mlme.mtx);
> ___ieee80211_stop_rx_ba_session(
> sta, tid, WLAN_BACK_RECIPIENT,
> 0, false);
> +   mutex_lock(>ampdu_mlme.mtx);
> +   }
>
> spin_lock_bh(>lock);
>
> --
> Stefano
>

ACK, I have it running since 12 minutes.
The hang usually appears shortly after boot as I set
kernel.hung_task_timeout_secs=10

-- 
Matteo Croce
per aspera ad upstream


Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
On Wed, 2017-09-06 at 14:48 +0200, Johannes Berg wrote:
> 
> I'm surprised nobody saw this before - though perhaps Sebastian's
> useless report is the same.

Oh, that's because this is only for the offloaded manager thing, and
that's only ath10k.

johannes


Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
On Wed, 2017-09-06 at 13:57 +0200, Matteo Croce wrote:

> I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
> The problem is present both on my AP and on my notebook,
> so it seems it affects AP and STA mode as well.
> The generated messages are:
> 
> INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
>   Not tainted 4.13.0 #57
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> message.
> kworker/u16:6   D0   120  2 0x
> Workqueue: phy0 ieee80211_ba_session_work [mac80211]
> Call Trace:
>  ? __schedule+0x174/0x5b0
>  ? schedule+0x31/0x80
>  ? schedule_preempt_disabled+0x9/0x10
>  ? __mutex_lock.isra.2+0x163/0x480
>  ? select_task_rq_fair+0xb9f/0xc60
>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]

Yeah - obviously as Stefano found, both take >ampdu_mlme.mtx.

Can you try this?

diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 2b36eff5d97e..d8d32776031e 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct 
ieee80211_sub_if_data *sdata, u8 *d
ieee80211_tx_skb(sdata, skb);
 }
 
-void __ieee80211_start_rx_ba_session(struct sta_info *sta,
-u8 dialog_token, u16 timeout,
-u16 start_seq_num, u16 ba_policy, u16 tid,
-u16 buf_size, bool tx, bool auto_seq)
+void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
+ u8 dialog_token, u16 timeout,
+ u16 start_seq_num, u16 ba_policy, u16 tid,
+ u16 buf_size, bool tx, bool auto_seq)
 {
struct ieee80211_local *local = sta->sdata->local;
struct tid_ampdu_rx *tid_agg_rx;
@@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
ht_dbg(sta->sdata,
   "STA %pM requests BA session on unsupported tid %d\n",
   sta->sta.addr, tid);
-   goto end_no_lock;
+   goto end;
}
 
if (!sta->sta.ht_cap.ht_supported) {
@@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
   "STA %pM erroneously requests BA session on tid %d w/o 
QoS\n",
   sta->sta.addr, tid);
/* send a response anyway, it's an error case if we get here */
-   goto end_no_lock;
+   goto end;
}
 
if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) {
ht_dbg(sta->sdata,
   "Suspend in progress - Denying ADDBA request (%pM tid 
%d)\n",
   sta->sta.addr, tid);
-   goto end_no_lock;
+   goto end;
}
 
/* sanity check for incoming parameters:
@@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
ht_dbg_ratelimited(sta->sdata,
   "AddBA Req with bad params from %pM on tid 
%u. policy %d, buffer size %d\n",
   sta->sta.addr, tid, ba_policy, buf_size);
-   goto end_no_lock;
+   goto end;
}
/* determine default buffer size */
if (buf_size == 0)
@@ -311,7 +311,6 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
   buf_size, sta->sta.addr);
 
/* examine state machine */
-   mutex_lock(>ampdu_mlme.mtx);
 
if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
@@ -415,15 +414,25 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
__clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
sta->ampdu_mlme.tid_rx_token[tid] = dialog_token;
}
-   mutex_unlock(>ampdu_mlme.mtx);
 
-end_no_lock:
if (tx)
ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
  dialog_token, status, 1, buf_size,
  timeout);
 }
 
+void __ieee80211_start_rx_ba_session(struct sta_info *sta,
+u8 dialog_token, u16 timeout,
+u16 start_seq_num, u16 ba_policy, u16 tid,
+u16 buf_size, bool tx, bool auto_seq)
+{
+   mutex_lock(>ampdu_mlme.mtx);
+   ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
+start_seq_num, ba_policy, tid,
+buf_size, tx, auto_seq);
+   mutex_unlock(>ampdu_mlme.mtx);
+}
+
 void ieee80211_process_addba_request(struct ieee80211_local *local,
 struct sta_info *sta,
 struct 

Re: hung task in mac80211

2017-09-06 Thread Johannes Berg
I'll look in a bit - but

> + mutex_unlock(>ampdu_mlme.mtx);
>   ___ieee80211_stop_rx_ba_session(
>   sta, tid, WLAN_BACK_RECIPIENT,
>   WLAN_REASON_QSTA_TIMEOUT, true);

This already has three underscores so shouldn't drop.

> 
> + mutex_unlock(>ampdu_mlme.mtx);
>   ___ieee80211_stop_rx_ba_session(

ditto

> + mutex_unlock(>ampdu_mlme.mtx);
>   __ieee80211_start_rx_ba_session(sta, 0, 0,
> 0, 1, tid,

maybe this one needs a ___ version then?

> + mutex_unlock(>ampdu_mlme.mtx);
>   ___ieee80211_stop_rx_ba_session(
> 
already ___

I'm surprised nobody saw this before - though perhaps Sebastian's
useless report is the same.

johannes


Re: hung task in mac80211

2017-09-06 Thread Stefano Brivio
On Wed, 6 Sep 2017 13:57:47 +0200
Matteo Croce  wrote:

> Hi,
> 
> I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
> The problem is present both on my AP and on my notebook,
> so it seems it affects AP and STA mode as well.
> The generated messages are:
> 
> INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
>   Not tainted 4.13.0 #57
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u16:6   D0   120  2 0x
> Workqueue: phy0 ieee80211_ba_session_work [mac80211]
> Call Trace:
>  ? __schedule+0x174/0x5b0
>  ? schedule+0x31/0x80
>  ? schedule_preempt_disabled+0x9/0x10
>  ? __mutex_lock.isra.2+0x163/0x480
>  ? select_task_rq_fair+0xb9f/0xc60
>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]

This is ugly and maybe wrong, but you could check perhaps...:

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c92df492e898..bd7512a656f2 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -320,28 +320,40 @@ void ieee80211_ba_session_work(struct work_struct *work)
 
mutex_lock(>ampdu_mlme.mtx);
for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
-   if (test_and_clear_bit(tid, 
sta->ampdu_mlme.tid_rx_timer_expired))
+   if (test_and_clear_bit(tid, 
sta->ampdu_mlme.tid_rx_timer_expired)) {
+   mutex_unlock(>ampdu_mlme.mtx);
___ieee80211_stop_rx_ba_session(
sta, tid, WLAN_BACK_RECIPIENT,
WLAN_REASON_QSTA_TIMEOUT, true);
+   mutex_lock(>ampdu_mlme.mtx);
+   }
 
if (test_and_clear_bit(tid,
-  sta->ampdu_mlme.tid_rx_stop_requested))
+  sta->ampdu_mlme.tid_rx_stop_requested)) {
+   mutex_unlock(>ampdu_mlme.mtx);
___ieee80211_stop_rx_ba_session(
sta, tid, WLAN_BACK_RECIPIENT,
WLAN_REASON_UNSPECIFIED, true);
+   mutex_lock(>ampdu_mlme.mtx);
+   }
 
if (test_and_clear_bit(tid,
-  sta->ampdu_mlme.tid_rx_manage_offl))
+  sta->ampdu_mlme.tid_rx_manage_offl)) {
+   mutex_unlock(>ampdu_mlme.mtx);
__ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
IEEE80211_MAX_AMPDU_BUF,
false, true);
+   mutex_lock(>ampdu_mlme.mtx);
+   }
 
if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
-  sta->ampdu_mlme.tid_rx_manage_offl))
+  sta->ampdu_mlme.tid_rx_manage_offl)) {
+   mutex_unlock(>ampdu_mlme.mtx);
___ieee80211_stop_rx_ba_session(
sta, tid, WLAN_BACK_RECIPIENT,
0, false);
+   mutex_lock(>ampdu_mlme.mtx);
+   }
 
spin_lock_bh(>lock);
 
-- 
Stefano



Re: hung task in mac80211

2017-09-06 Thread Christian Lamparter
On Wednesday, September 6, 2017 1:57:47 PM CEST Matteo Croce wrote:
> Hi,
> 
> I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
> The problem is present both on my AP and on my notebook,
> so it seems it affects AP and STA mode as well.
> The generated messages are:
> 
> INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
>   Not tainted 4.13.0 #57
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> kworker/u16:6   D0   120  2 0x
> Workqueue: phy0 ieee80211_ba_session_work [mac80211]
> Call Trace:
>  ? __schedule+0x174/0x5b0
>  ? schedule+0x31/0x80
>  ? schedule_preempt_disabled+0x9/0x10
>  ? __mutex_lock.isra.2+0x163/0x480
>  ? select_task_rq_fair+0xb9f/0xc60
>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>  ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
>  ? try_to_wake_up+0x1f1/0x340
>  ? update_curr+0x88/0xd0
>  ? ieee80211_ba_session_work+0x148/0x230 [mac80211]
> 
> I did a bisect and the offending commit is:
> 
> commit 699cb58c8a52ff39bf659bff7971893ebe111bf2
> Author: Johannes Berg 
> Date:   Tue May 30 16:34:46 2017 +0200
> 
> mac80211: manage RX BA session offload without SKB queue

I looked at this briefly:

ieee80211_ba_session_work acquires:
mutex_lock(>ampdu_mlme.mtx) @


But it now also calls
__ieee80211_start_rx_ba_session() @
http://elixir.free-electrons.com/linux/v4.13/source/net/mac80211/ht.c#L336

which also wants to hold mutex_lock(>ampdu_mlme.mtx) in:
http://elixir.free-electrons.com/linux/v4.13/source/net/mac80211/agg-rx.c#L314

I guess this is where it deadlocks?

Regards,
Christian


hung task in mac80211

2017-09-06 Thread Matteo Croce
Hi,

I have an hung task on vanilla 4.13 kernel which I haven't on 4.12.
The problem is present both on my AP and on my notebook,
so it seems it affects AP and STA mode as well.
The generated messages are:

INFO: task kworker/u16:6:120 blocked for more than 120 seconds.
  Not tainted 4.13.0 #57
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/u16:6   D0   120  2 0x
Workqueue: phy0 ieee80211_ba_session_work [mac80211]
Call Trace:
 ? __schedule+0x174/0x5b0
 ? schedule+0x31/0x80
 ? schedule_preempt_disabled+0x9/0x10
 ? __mutex_lock.isra.2+0x163/0x480
 ? select_task_rq_fair+0xb9f/0xc60
 ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
 ? __ieee80211_start_rx_ba_session+0x135/0x4d0 [mac80211]
 ? try_to_wake_up+0x1f1/0x340
 ? update_curr+0x88/0xd0
 ? ieee80211_ba_session_work+0x148/0x230 [mac80211]
 ? process_one_work+0x1a5/0x330
 ? worker_thread+0x42/0x3c0
 ? create_worker+0x170/0x170
 ? kthread+0x10d/0x130
 ? kthread_create_on_node+0x40/0x40
 ? ret_from_fork+0x22/0x30

I did a bisect and the offending commit is:

commit 699cb58c8a52ff39bf659bff7971893ebe111bf2
Author: Johannes Berg 
Date:   Tue May 30 16:34:46 2017 +0200

mac80211: manage RX BA session offload without SKB queue

Instead of using the SKB queue with the fake pkt_type for the
offloaded RX BA session management, also handle this with the
normal aggregation state machine worker. This also makes the
use of this more reliable since it gets rid of the allocation
of the fake skb.

Combined with the previous patch, this finally allows us to
get rid of the pkt_type hack entirely, so do that as well.

Signed-off-by: Johannes Berg 

Regards,
-- 
Matteo Croce
per aspera ad upstream