Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-19 Thread Sebastian Andrzej Siewior
On 04/19/2016 10:55 AM, Mike Galbraith wrote:
> I can't get to my DL980 to do jitter testing atm (network outage), but
> wrt hotplug banging, my local boxen say patch is toxic.  i4790 desktop
> box silently bricked once too.  The boom begins with...
> 
>BUG: scheduling while atomic: futex_wait/11303/0x
> 
> ...very noisy funeral procession follows.

thanks for the feedback. My box survived overnight. Let me dig further
maybe I find something.

> 
>   -Mike
> 
Sebastian


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-19 Thread Mike Galbraith
On Tue, 2016-04-19 at 09:07 +0200, Sebastian Andrzej Siewior wrote:
> On 04/18/2016 07:55 PM, Mike Galbraith wrote:
> > 
> > I'll have to feed it to DL980, hotplug and jitter test it.  It seemed
> > to think that pinning post acquisition was a bad idea jitter wise, but
> > I was bending things up while juggling multiple boxen, so..
> 
> pinning pre acquisition could get you in a situation where you get the
> lock and you are stuck on CPU A where is also a task running right now
> with a higher priority while CPU B and CPU C are idle.

I can't get to my DL980 to do jitter testing atm (network outage), but
wrt hotplug banging, my local boxen say patch is toxic.  i4790 desktop
box silently bricked once too.  The boom begins with...

   BUG: scheduling while atomic: futex_wait/11303/0x

...very noisy funeral procession follows.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-19 Thread Sebastian Andrzej Siewior
On 04/18/2016 07:55 PM, Mike Galbraith wrote:
> 
> I'll have to feed it to DL980, hotplug and jitter test it.  It seemed
> to think that pinning post acquisition was a bad idea jitter wise, but
> I was bending things up while juggling multiple boxen, so..

pinning pre acquisition could get you in a situation where you get the
lock and you are stuck on CPU A where is also a task running right now
with a higher priority while CPU B and CPU C are idle.

> 
>   -Mike
> 
Sebastian


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-18 Thread Mike Galbraith
On Mon, 2016-04-18 at 19:15 +0200, Sebastian Andrzej Siewior wrote:

> take 2. There is this else case in pin_current_cpu() where I take
> hp_lock. I didn't manage to get in there. So I *think* we can get rid of
> the lock now. Since there is no lock (or will be) we can drop the whole
> `do_mig_dis' checking and do the migrate_disable() _after_ we obtained
> the lock. We were not able to do so due to the lock hp_lock.
> 
> And with this, I didn't manage to trigger the lockup you had with
> futextest.

I'll have to feed it to DL980, hotplug and jitter test it.  It seemed
to think that pinning post acquisition was a bad idea jitter wise, but
I was bending things up while juggling multiple boxen, so..

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-18 Thread Sebastian Andrzej Siewior
* Mike Galbraith | 2016-04-08 18:49:28 [+0200]:

>On Fri, 2016-04-08 at 16:51 +0200, Sebastian Andrzej Siewior wrote:
>
>> Is there anything you can hand me over?
>
>Sure, I'll send it offline (yup, that proud of my scripting;)
>
>   -Mike

take 2. There is this else case in pin_current_cpu() where I take
hp_lock. I didn't manage to get in there. So I *think* we can get rid of
the lock now. Since there is no lock (or will be) we can drop the whole
`do_mig_dis' checking and do the migrate_disable() _after_ we obtained
the lock. We were not able to do so due to the lock hp_lock.

And with this, I didn't manage to trigger the lockup you had with
futextest.

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f9a0f2b540f1..b0f786274025 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1492,7 +1492,7 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
 #endif
-
+   unsigned mig_away :1;
unsigned long atomic_flags; /* Flags needing atomic access. */
 
struct restart_block restart_block;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8edd3c716092..3a1ee02ba3ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -30,6 +30,10 @@
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
+static DEFINE_SPINLOCK(cpumask_lock);
+static cpumask_var_t mig_cpumask;
+static cpumask_var_t mig_cpumask_org;
+
 /*
  * The following two APIs (cpu_maps_update_begin/done) must be used when
  * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
@@ -120,6 +124,8 @@ struct hotplug_pcp {
 * state.
 */
spinlock_t lock;
+   cpumask_var_t cpumask;
+   cpumask_var_t cpumask_org;
 #else
struct mutex mutex;
 #endif
@@ -158,9 +164,30 @@ void pin_current_cpu(void)
return;
}
if (hp->grab_lock) {
+   int cpu;
+
+   cpu = smp_processor_id();
preempt_enable();
-   hotplug_lock(hp);
-   hotplug_unlock(hp);
+   if (cpu != raw_smp_processor_id())
+   goto retry;
+
+   current->mig_away = 1;
+   rt_spin_lock__no_mg(&cpumask_lock);
+
+   /* DOWN */
+   cpumask_copy(mig_cpumask_org, tsk_cpus_allowed(current));
+   cpumask_andnot(mig_cpumask, cpu_online_mask, cpumask_of(cpu));
+   set_cpus_allowed_ptr(current, mig_cpumask);
+
+   if (cpu == raw_smp_processor_id()) {
+   /* BAD */
+   hotplug_lock(hp);
+   hotplug_unlock(hp);
+   }
+   set_cpus_allowed_ptr(current, mig_cpumask_org);
+   current->mig_away = 0;
+   rt_spin_unlock__no_mg(&cpumask_lock);
+
} else {
preempt_enable();
/*
@@ -800,7 +827,13 @@ static struct notifier_block smpboot_thread_notifier = {
 
 void smpboot_thread_init(void)
 {
+   bool ok;
+
register_cpu_notifier(&smpboot_thread_notifier);
+   ok = alloc_cpumask_var(&mig_cpumask, GFP_KERNEL);
+   BUG_ON(!ok);
+   ok = alloc_cpumask_var(&mig_cpumask_org, GFP_KERNEL);
+   BUG_ON(!ok);
 }
 
 /* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 66971005cc12..b5e5e6a15278 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -930,13 +930,13 @@ static inline void rt_spin_lock_fastlock(struct rt_mutex 
*lock,
 {
might_sleep_no_state_check();
 
-   if (do_mig_dis)
+   if (do_mig_dis && 0)
migrate_disable();
 
if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
rt_mutex_deadlock_account_lock(lock, current);
else
-   slowfn(lock, do_mig_dis);
+   slowfn(lock, false);
 }
 
 static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
@@ -1125,12 +1125,14 @@ void __lockfunc rt_spin_lock(spinlock_t *lock)
 {
rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+   migrate_disable();
 }
 EXPORT_SYMBOL(rt_spin_lock);
 
 void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
 {
rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
+   migrate_disable();
 }
 EXPORT_SYMBOL(__rt_spin_lock);
 
@@ -1145,6 +1147,7 @@ void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int 
subclass)
 {
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
+   migrate_disable();
 }
 EXPORT_SYMBOL(rt_spin_lock_nested);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da96d97f3d79..0eb7496870bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3369,6 +3369,9 @@ static inline void sched_submit_work(struct task_stru

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Mike Galbraith
On Fri, 2016-04-08 at 16:51 +0200, Sebastian Andrzej Siewior wrote:

> Is there anything you can hand me over?

Sure, I'll send it offline (yup, that proud of my scripting;)

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Sebastian Andrzej Siewior
On 04/08/2016 04:16 PM, Mike Galbraith wrote:
>> okay. and how did you trigger this? Just Steven's script or was there
>> more to it?
> 
> I run stockfish, futextest, hackbench and tbench with it, terminating
> and restarting them at random intervals just to make sure nobody gets
> into a comfortable little rut.  Stockfish and tbench are sized as to
> not saturate the box, hackbench runs periodically (and with no args to
> turn it into a hog), futextest run.sh just does its normal thing.

Is there anything you can hand me over?

> Trying to grab an rtmutex while queued on an rtmutex... doesn't matter
> much if it's the lock that likes to deadlock us, or the one you added
> instead of making that blasted lock really really dead.

Yeah, doesn't look too good.

>   -Mike
> 
Sebastian


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Mike Galbraith
On Fri, 2016-04-08 at 15:58 +0200, Sebastian Andrzej Siewior wrote:
> On 04/08/2016 03:44 PM, Mike Galbraith wrote:
> > On Thu, 2016-04-07 at 18:47 +0200, Sebastian Andrzej Siewior wrote:
> > 
> > > just to be clear: The patch I attached did _not_ work for you.
> > 
> > Did you perchance mean with "Reenable migration across schedule"
> > reverted?  Figured it would still explode in seconds.. it did.
> 
> I meant 4.4.6-rt13 + my patch and nothing else.
> 
> > [  172.996232] kernel BUG at kernel/locking/rtmutex.c:1360!
> 
> okay. and how did you trigger this? Just Steven's script or was there
> more to it?

I run stockfish, futextest, hackbench and tbench with it, terminating
and restarting them at random intervals just to make sure nobody gets
into a comfortable little rut.  Stockfish and tbench are sized as to
not saturate the box, hackbench runs periodically (and with no args to
turn it into a hog), futextest run.sh just does its normal thing.

Trying to grab an rtmutex while queued on an rtmutex... doesn't matter
much if it's the lock that likes to deadlock us, or the one you added
instead of making that blasted lock really really dead.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Sebastian Andrzej Siewior
On 04/08/2016 03:44 PM, Mike Galbraith wrote:
> On Thu, 2016-04-07 at 18:47 +0200, Sebastian Andrzej Siewior wrote:
> 
>> just to be clear: The patch I attached did _not_ work for you.
> 
> Did you perchance mean with "Reenable migration across schedule"
> reverted?  Figured it would still explode in seconds.. it did.

I meant 4.4.6-rt13 + my patch and nothing else.

> [  172.996232] kernel BUG at kernel/locking/rtmutex.c:1360!

okay. and how did you trigger this? Just Steven's script or was there
more to it?

Sebastian



Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Mike Galbraith
On Thu, 2016-04-07 at 18:47 +0200, Sebastian Andrzej Siewior wrote:

> just to be clear: The patch I attached did _not_ work for you.

Did you perchance mean with "Reenable migration across schedule"
reverted?  Figured it would still explode in seconds.. it did.

[  172.996232] kernel BUG at kernel/locking/rtmutex.c:1360!
[  172.996234] invalid opcode:  [#1] PREEMPT SMP 
[  172.996236] Dumping ftrace buffer:
[  172.996239](ftrace buffer empty)
[  172.996254] Modules linked in: ebtable_filter(E) ebtables(E) fuse(E) 
nf_log_ipv6(E) xt_pkttype(E) xt_physdev(E) br_netfilter(E) nf_log_ipv4(E) 
nf_log_common(E) xt_LOG(E) xt_limit(E) af_packet(E) bridge(E) stp(E) llc(E) 
iscsi_ibft(E) iscsi_boot_sysfs(E) ip6t_REJECT(E) xt_tcpudp(E) 
nf_conntrack_ipv6(E) nf_defrag_ipv6(E) ip6table_raw(E) ipt_REJECT(E) 
iptable_raw(E) xt_CT(E) iptable_filter(E) ip6table_mangle(E) 
nf_conntrack_netbios_ns(E) nf_conntrack_broadcast(E) nf_conntrack_ipv4(E) 
nf_defrag_ipv4(E) ip_tables(E) xt_conntrack(E) nf_conntrack(E) 
ip6table_filter(E) ip6_tables(E) x_tables(E) nls_iso8859_1(E) nls_cp437(E) 
vfat(E) fat(E) intel_rapl(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) 
kvm(E) irqbypass(E) crct10dif_pclmul(E) crc32_pclmul(E) crc32c_intel(E) 
snd_hda_codec_hdmi(E) snd_hda_codec_realtek(E)
[  172.996271]  snd_hda_codec_generic(E) drbg(E) snd_hda_intel(E) ansi_cprng(E) 
snd_hda_codec(E) snd_hda_core(E) snd_hwdep(E) aesni_intel(E) snd_pcm(E) 
aes_x86_64(E) lrw(E) r8169(E) mii(E) snd_timer(E) gf128mul(E) dm_mod(E) 
iTCO_wdt(E) iTCO_vendor_support(E) lpc_ich(E) mei_me(E) shpchp(E) snd(E) 
i2c_i801(E) joydev(E) pcspkr(E) serio_raw(E) glue_helper(E) ablk_helper(E) 
mei(E) mfd_core(E) cryptd(E) soundcore(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) 
lockd(E) grace(E) processor(E) thermal(E) battery(E) fan(E) tpm_infineon(E) 
fjes(E) intel_smartconnect(E) sunrpc(E) efivarfs(E) ext4(E) crc16(E) mbcache(E) 
jbd2(E) sr_mod(E) cdrom(E) sd_mod(E) hid_logitech_hidpp(E) hid_logitech_dj(E) 
hid_generic(E) uas(E) usb_storage(E) usbhid(E) nouveau(E) wmi(E) 
i2c_algo_bit(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E)
[  172.996275]  ahci(E) sysimgblt(E) fb_sys_fops(E) libahci(E) ttm(E) libata(E) 
drm(E) video(E) button(E) sg(E) scsi_mod(E) autofs4(E)
[  172.996277] CPU: 7 PID: 6109 Comm: futex_wait Tainted: GE   
4.4.6-rt13-virgin #12
[  172.996277] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 
09/23/2013
[  172.996278] task: 88017ce6ab80 ti: 8803d2e2 task.ti: 
8803d2e2
[  172.996283] RIP: 0010:[]  [] 
task_blocks_on_rt_mutex+0x243/0x260
[  172.996284] RSP: 0018:8803d2e23a38  EFLAGS: 00010092
[  172.996285] RAX: 8803d2e23c10 RBX: 88017ce6ab80 RCX: 
[  172.996285] RDX: 0001 RSI: 8803d2e23a98 RDI: 88017ce6b258
[  172.996286] RBP: 8803d2e23a68 R08: 8800dddc R09: 81f33918
[  172.996286] R10: 8800dddc0001 R11:  R12: 8800dddc
[  172.996287] R13: 8803d2e23a98 R14: 81f33900 R15: 
[  172.996288] FS:  7f4017988700() GS:88041edc() 
knlGS:
[  172.996288] CS:  0010 DS:  ES:  CR0: 80050033
[  172.996289] CR2:  CR3: 0003bf7f4000 CR4: 001406e0
[  172.996289] Stack:
[  172.996291]  7ce6abe8 81f33900 8803d2e23a98 

[  172.996292]    8803d2e23b08 
8162f105
[  172.996293]  0002 0296  
8803d2e23ae8
[  172.996293] Call Trace:
[  172.996298]  [] rt_mutex_slowlock+0xe5/0x290
[  172.996301]  [] ? pick_next_entity+0xa5/0x160
[  172.996303]  [] rt_mutex_lock+0x31/0x40
[  172.996304]  [] _mutex_lock+0xe/0x10
[  172.996306]  [] migrate_me+0x63/0x1f0
[  172.996308]  [] ? finish_task_switch+0x7d/0x300
[  172.996310]  [] pin_current_cpu+0x1e5/0x2a0
[  172.996311]  [] migrate_disable+0x73/0xd0
[  172.996313]  [] rt_spin_lock_slowlock+0x1e8/0x2e0
[  172.996314]  [] rt_spin_lock+0x38/0x40
[  172.996317]  [] futex_wait_setup+0x98/0x100
[  172.996318]  [] futex_wait+0x14f/0x240
[  172.996320]  [] ? rt_mutex_dequeue_pi+0x36/0x60
[  172.996322]  [] ? rt_mutex_adjust_prio+0x36/0x40
[  172.996323]  [] ? rt_spin_lock_slowunlock+0x84/0xc0
[  172.996325]  [] do_futex+0xd1/0x560
[  172.996327]  [] ? __switch_to+0x1d6/0x450
[  172.996329]  [] ? finish_task_switch+0x7d/0x300
[  172.996330]  [] ? __schedule+0x2ae/0x7d0
[  172.996332]  [] SyS_futex+0x71/0x150
[  172.996334]  [] ? exit_to_usermode_loop+0x4b/0xe4
[  172.996335]  [] entry_SYSCALL_64_fastpath+0x12/0x71
[  172.996349] Code: 0d 1b 54 f5 7e 74 30 65 48 8b 04 25 c4 28 01 00 48 8b 80 
08 c0 ff ff f6 c4 02 75 1b b8 f5 ff ff ff e9 25 ff ff ff e8 8d f5 ff ff <0f> 0b 
e8 d6 b5 f4 ff e9 0e ff ff ff e8 cc b5 f4 ff b8 f5 ff ff 
[  172.996351] RIP  [] task_blocks_on_rt_mutex+0x243/0x260
[  172.996351]  RSP 


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Mike Galbraith
On Fri, 2016-04-08 at 12:30 +0200, Sebastian Andrzej Siewior wrote:
> On 04/07/2016 09:04 PM, Mike Galbraith wrote:
> > > just to be clear: The patch I attached did _not_ work for you.
> > 
> > Sorry, I didn't test.  Marathon stress test session convinced me that
> > the lock added by -rt absolutely had to die.
> 
> Okay. And the patch did that. I removed the lock.

But also adds when it appears no addition is required.  I don't care
how it dies though, only that it does.

> I see. So what I don't like are all the exceptions you have: one for
> RCU and one kernfs. There might come more in the future. So what I aim
> is the removal of the lock.

Yes, those two were bandaids to allow searching for more -rt specific
disease (none found).  Removing that lock is the cure.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-08 Thread Sebastian Andrzej Siewior
On 04/07/2016 09:04 PM, Mike Galbraith wrote:
>> just to be clear: The patch I attached did _not_ work for you.
> 
> Sorry, I didn't test.  Marathon stress test session convinced me that
> the lock added by -rt absolutely had to die.

Okay. And the patch did that. I removed the lock.

>>> If that lock dies, we can unpin when entering lock slow path and pin
>>> again post acquisition with no ABBA worries as well, and not only does
>>> existing hotplug work heaping truckloads better, -rt can perhaps help
>>> spot trouble as the rewrite proceeds.
>>>
>>> Current state is more broken than ever.. if that's possible.
>>
>> And the two patches you attached here did?
> 
> I've killed way too many NOPREEMPT kernels to make any rash -rt claims.
>  What I can tell you is that my 64 core DL980 running 4.6-rc2-rt13 plus
> the two posted patches survived for ~20 hours before I had to break it
> off because I needed the box. 
> 
> These two haven't been through _as_ much pounding as the two targeted
> bandaids I showed have, but have been through quite a bit.  Other folks
> beating the living crap outta their boxen too would not be a bad idea.

I see. So what I don't like are all the exceptions you have: one for
RCU and one kernfs. There might come more in the future. So what I aim
is the removal of the lock.

> 
>   -Mike
> 
Sebastian


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-07 Thread Mike Galbraith
On Thu, 2016-04-07 at 18:47 +0200, Sebastian Andrzej Siewior wrote:

> > If that lock dies, we can unpin when entering lock slow path and pin
> > again post acquisition with no ABBA worries as well, and not only does
> > existing hotplug work heaping truckloads better, -rt can perhaps help
> > spot trouble as the rewrite proceeds.
> > 
> > Current state is more broken than ever.. if that's possible.
> 
> And the two patches you attached here did?

Re-reading your question, no, the only troubles I encountered were the
rt specific woes previously identified.  So the thought that started me
down this path turned up jack-diddly-spit.. but that's not a bad thing,
so I don't consider it to have been a waste of time.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-07 Thread Mike Galbraith
On Thu, 2016-04-07 at 18:47 +0200, Sebastian Andrzej Siewior wrote:
> On 04/02/2016 05:12 AM, Mike Galbraith wrote:
> > > By the time I improved hotplug I played with this. I had a few ideas but
> > > it didn't fly in the end. Today however I ended up with this:
> > 
> > Yeah, but that fails the duct tape test too.  Mine is below, and is the
> > extra sticky variety ;-)  With busted 0299 patch reverted and those two
> > applied, my DL980 took a beating for ~36 hours before I aborted it.. ie
> > hotplug road seemingly has no more -rt specific potholes.
> 
> just to be clear: The patch I attached did _not_ work for you.

Sorry, I didn't test.  Marathon stress test session convinced me that
the lock added by -rt absolutely had to die.

> > If that lock dies, we can unpin when entering lock slow path and pin
> > again post acquisition with no ABBA worries as well, and not only does
> > existing hotplug work heaping truckloads better, -rt can perhaps help
> > spot trouble as the rewrite proceeds.
> > 
> > Current state is more broken than ever.. if that's possible.
> 
> And the two patches you attached here did?

I've killed way too many NOPREEMPT kernels to make any rash -rt claims.
 What I can tell you is that my 64 core DL980 running 4.6-rc2-rt13 plus
the two posted patches survived for ~20 hours before I had to break it
off because I needed the box. 

These two haven't been through _as_ much pounding as the two targeted
bandaids I showed have, but have been through quite a bit.  Other folks
beating the living crap outta their boxen too would not be a bad idea.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-07 Thread Sebastian Andrzej Siewior
On 04/02/2016 05:12 AM, Mike Galbraith wrote:
>> By the time I improved hotplug I played with this. I had a few ideas but
>> it didn't fly in the end. Today however I ended up with this:
> 
> Yeah, but that fails the duct tape test too.  Mine is below, and is the
> extra sticky variety ;-)  With busted 0299 patch reverted and those two
> applied, my DL980 took a beating for ~36 hours before I aborted it.. ie
> hotplug road seemingly has no more -rt specific potholes.

just to be clear: The patch I attached did _not_ work for you.

> If that lock dies, we can unpin when entering lock slow path and pin
> again post acquisition with no ABBA worries as well, and not only does
> existing hotplug work heaping truckloads better, -rt can perhaps help
> spot trouble as the rewrite proceeds.
> 
> Current state is more broken than ever.. if that's possible.

And the two patches you attached here did?

> 
>   -Mike

Sebastian



Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-01 Thread Mike Galbraith
On Fri, 2016-04-01 at 23:11 +0200, Sebastian Andrzej Siewior wrote:
> * Mike Galbraith | 2016-03-31 08:31:43 [+0200]:
> 
> > 3. nuke irksome grab_lock: make everybody always try to get the hell
> > outta Dodge or hotplug can bloody well wait.
> > 
> > I haven't yet flogged my 64 core box doing that, but my local boxen
> > seem to be saying we don't really really need the grab_lock business.
> > 
> > Are my boxen fibbing, is that very attractive looking door #3 a trap?
> 
> By the time I improved hotplug I played with this. I had a few ideas but
> it didn't fly in the end. Today however I ended up with this:

Yeah, but that fails the duct tape test too.  Mine is below, and is the
extra sticky variety ;-)  With busted 0299 patch reverted and those two
applied, my DL980 took a beating for ~36 hours before I aborted it.. ie
hotplug road seemingly has no more -rt specific potholes.

If that lock dies, we can unpin when entering lock slow path and pin
again post acquisition with no ABBA worries as well, and not only does
existing hotplug work heaping truckloads better, -rt can perhaps help
spot trouble as the rewrite proceeds.

Current state is more broken than ever.. if that's possible.

-Mike

hotplug/rt: Do not let pin_current_cpu() block RCU grace periods

Notifiers may depend upon grace periods continuing to advance
as blk_mq_queue_reinit_notify() below.

crash> bt 8803aee76400
PID: 1113   TASK: 8803aee76400  CPU: 0   COMMAND: "stress-cpu-hotp"
 #0 [880396fe7ad8] __schedule at 816b7142
 #1 [880396fe7b28] schedule at 816b797b
 #2 [880396fe7b48] blk_mq_freeze_queue_wait at 8135c5ac
 #3 [880396fe7b80] blk_mq_queue_reinit_notify at 8135f819
 #4 [880396fe7b98] notifier_call_chain at 8109b8ed
 #5 [880396fe7bd8] __raw_notifier_call_chain at 8109b91e
 #6 [880396fe7be8] __cpu_notify at 81072825
 #7 [880396fe7bf8] cpu_notify_nofail at 81072b15
 #8 [880396fe7c08] notify_dead at 81072d06
 #9 [880396fe7c38] cpuhp_invoke_callback at 81073718
#10 [880396fe7c78] cpuhp_down_callbacks at 81073a70
#11 [880396fe7cb8] _cpu_down at 816afc71
#12 [880396fe7d38] do_cpu_down at 8107435c
#13 [880396fe7d60] cpu_down at 81074390
#14 [880396fe7d70] cpu_subsys_offline at 814cd854
#15 [880396fe7d80] device_offline at 814c7cda
#16 [880396fe7da8] online_store at 814c7dd0
#17 [880396fe7dd0] dev_attr_store at 814c4fc8
#18 [880396fe7de0] sysfs_kf_write at 812cfbe4
#19 [880396fe7e08] kernfs_fop_write at 812cf172
#20 [880396fe7e50] __vfs_write at 81241428
#21 [880396fe7ed0] vfs_write at 81242535
#22 [880396fe7f10] sys_write at 812438f9
#23 [880396fe7f50] entry_SYSCALL_64_fastpath at 816bb4bc
RIP: 7fafd918acd0  RSP: 7ffd2ca956e8  RFLAGS: 0246
RAX: ffda  RBX: 0226a770  RCX: 7fafd918acd0
RDX: 0002  RSI: 7fafd9cb9000  RDI: 0001
RBP: 7ffd2ca95700   R8: 000a   R9: 7fafd9cb3700
R10:   R11: 0246  R12: 0007
R13: 0001  R14: 0009  R15: 000a
ORIG_RAX: 0001  CS: 0033  SS: 002b

blk_mq_queue_reinit_notify:
/*
 * We need to freeze and reinit all existing queues.  Freezing
 * involves synchronous wait for an RCU grace period and doing it
 * one by one may take a long time.  Start freezing all queues in
 * one swoop and then wait for the completions so that freezing can
 * take place in parallel.
 */
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node) {
blk_mq_freeze_queue_wait(q);

crash> bt 880176cc9900
PID: 17 TASK: 880176cc9900  CPU: 0   COMMAND: "rcu_sched"
 #0 [880176cd7ab8] __schedule at 816b7142
 #1 [880176cd7b08] schedule at 816b797b
 #2 [880176cd7b28] rt_spin_lock_slowlock at 816b974d
 #3 [880176cd7bc8] rt_spin_lock_fastlock at 811b0f3c
 #4 [880176cd7be8] rt_spin_lock__no_mg at 816bac1b
 #5 [880176cd7c08] pin_current_cpu at 8107406a
 #6 [880176cd7c50] migrate_disable at 810a0e9e
 #7 [880176cd7c70] rt_spin_lock at 816bad69
 #8 [880176cd7c90] lock_timer_base at 810fc5e8
 #9 [880176cd7cc8] try_to_del_timer_sync at 810fe290
#10 [880176cd7cf0] del_timer_sync at 810fe381
#11 [880176cd7d58] schedule_timeout at 816b9e4b
#12 [880176cd7df0] rcu_gp_kthread at 810f52b4
#13 [880176cd7e70] kthread at 8109a02f
#14 [880176cd7f50] ret_from_fork at 816bb6f2

Game Over.

Signed-off-by: Mike Galbraith 
---
 include

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

2016-04-01 Thread Sebastian Andrzej Siewior
* Mike Galbraith | 2016-03-31 08:31:43 [+0200]:

>3. nuke irksome grab_lock: make everybody always try to get the hell
>outta Dodge or hotplug can bloody well wait.
>
>I haven't yet flogged my 64 core box doing that, but my local boxen
>seem to be saying we don't really really need the grab_lock business.
>
>Are my boxen fibbing, is that very attractive looking door #3 a trap?

By the time I improved hotplug I played with this. I had a few ideas but
it didn't fly in the end. Today however I ended up with this:

-- 

Subject: [PATCH] kernel: hotplug: migrate to another CPU if the current one is
 going away

If CPU X is going down then every task running on that CPU must go away.
This is achieved by setting hp->grab_lock to force the task to block
on hp->lock and schedule away in migrate_disable() before the task is
pinned to the CPU.
The task blocks on the lock until the CPU is down. If the task holds any
resources (locks) that are required by one of the CPU notifiers then we
will deadlock here.
One potential deadlock is the blk_mq CPU notifier
(blk_mq_queue_reinit_notify()), because it waits for the RCU grace period
to advance/complete, which is stuck on hp->lock and won't make any
progress.
Mike identified another candidate, "thermal_throttle_cpu_callback() ->
kernfs_find_and_get_ns()", which blocks on kernfs_mutex that is held
by udev via load_module() and blocks on hp->lock.

So instead of getting tasks off the CPU by blocking on a lock, I attempt
to get off the CPU by masking it out from the CPU mask. The ->mig_away
flag is used primarily to avoid a blk_flush_plug_list() invocation via
schedule() and a deadlock on cpumask_lock (because the first spinlock
will invoke migrate_disable()).

Signed-off-by: Sebastian Andrzej Siewior 
---
 include/linux/sched.h |  2 +-
 kernel/cpu.c  | 37 +++--
 kernel/sched/core.c   |  3 +++
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 58c5ec8c3742..d0ba00b9aff4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1492,7 +1492,7 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
 #endif
-
+   unsigned mig_away :1;
unsigned long atomic_flags; /* Flags needing atomic access. */
 
struct restart_block restart_block;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8edd3c716092..3a1ee02ba3ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -30,6 +30,10 @@
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
+static DEFINE_SPINLOCK(cpumask_lock);
+static cpumask_var_t mig_cpumask;
+static cpumask_var_t mig_cpumask_org;
+
 /*
  * The following two APIs (cpu_maps_update_begin/done) must be used when
  * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
@@ -120,6 +124,8 @@ struct hotplug_pcp {
 * state.
 */
spinlock_t lock;
+   cpumask_var_t cpumask;
+   cpumask_var_t cpumask_org;
 #else
struct mutex mutex;
 #endif
@@ -158,9 +164,30 @@ void pin_current_cpu(void)
return;
}
if (hp->grab_lock) {
+   int cpu;
+
+   cpu = smp_processor_id();
preempt_enable();
-   hotplug_lock(hp);
-   hotplug_unlock(hp);
+   if (cpu != raw_smp_processor_id())
+   goto retry;
+
+   current->mig_away = 1;
+   rt_spin_lock__no_mg(&cpumask_lock);
+
+   /* DOWN */
+   cpumask_copy(mig_cpumask_org, tsk_cpus_allowed(current));
+   cpumask_andnot(mig_cpumask, cpu_online_mask, cpumask_of(cpu));
+   set_cpus_allowed_ptr(current, mig_cpumask);
+
+   if (cpu == raw_smp_processor_id()) {
+   /* BAD */
+   hotplug_lock(hp);
+   hotplug_unlock(hp);
+   }
+   set_cpus_allowed_ptr(current, mig_cpumask_org);
+   current->mig_away = 0;
+   rt_spin_unlock__no_mg(&cpumask_lock);
+
} else {
preempt_enable();
/*
@@ -800,7 +827,13 @@ static struct notifier_block smpboot_thread_notifier = {
 
 void smpboot_thread_init(void)
 {
+   bool ok;
+
register_cpu_notifier(&smpboot_thread_notifier);
+   ok = alloc_cpumask_var(&mig_cpumask, GFP_KERNEL);
+   BUG_ON(!ok);
+   ok = alloc_cpumask_var(&mig_cpumask_org, GFP_KERNEL);
+   BUG_ON(!ok);
 }
 
 /* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94827a59301e..ab06be452eb7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3369,6 +3369,9 @@ static inline void sched_submit_work(struct task_struct 
*tsk)
 {
if (!tsk->state)
return;
+
+   if (tsk->mig_away)
+   return;
/*
 * If a worker went to s

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-30 Thread Mike Galbraith
On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:

> I really wonder what makes the change. The only thing which comes to my mind
> is the enforcement of running the online and down_prepare callbacks on the
> plugged cpu instead of doing it wherever the scheduler decides to run it.

It seems it's not the state machinery making a difference after all,
the only two deadlocks encountered in oodles of beating seem to boil
down to the grab_lock business being a pistol aimed at our own toes.

1. kernfs_mutex taken during hotplug: We don't pin across mutex
acquisition, so anyone grabbing it and then calling migrate_disable()
while grab_lock is set renders us dead.  Pin across acquisition of that
specific mutex fixes that specific grab_lock instigated deadlock.

2. notifier dependency upon RCU GP threads:  Telling same to always do
migrate_me() or hotplug can bloody well wait fixes that specific
grab_lock instigated deadlock.

With those two little hacks, all of my boxen including DL980 just keep
on chugging away in 4.[456]-rt, showing zero inclination to identify
any more hotplug bandits.

What I like much better than 1 + 2 is their sum, which would generate
minus signs, my favorite thing in patches, and fix the two above and
anything that resembles them in any way...

3. nuke irksome grab_lock: make everybody always try to get the hell
outta Dodge or hotplug can bloody well wait.

I haven't yet flogged my 64 core box doing that, but my local boxen
seem to be saying we don't really really need the grab_lock business.

Are my boxen fibbing, is that very attractive looking door #3 a trap?

-Mike



Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-28 Thread Mike Galbraith
On Fri, 2016-03-25 at 17:24 +0100, Mike Galbraith wrote:
> On Fri, 2016-03-25 at 10:13 +0100, Mike Galbraith wrote:
> > On Fri, 2016-03-25 at 09:52 +0100, Thomas Gleixner wrote:
> > > On Fri, 25 Mar 2016, Mike Galbraith wrote:
> > > > On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > > > > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > > > > >  
> > > > > > > On the bright side, with the busted migrate enable business 
> > > > > > > reverted,
> > > > > > > plus one dinky change from me [1], master-rt.today has completed 
> > > > > > > 100
> > > > > > > iterations of Steven's hotplug stress script along side endless
> > > > > > > futexstress, and is happily doing another 900 as I write this, so 
> > > > > > > the
> > > > > > > next -rt should finally be hotplug deadlock free.
> > > > > > > 
> > > > > > > Thomas's state machinery seems to work wonders.  'course this 
> > > > > > > being
> > > > > > > hotplug, the other shoe will likely apply itself to my backside 
> > > > > > > soon.
> > > > > > 
> > > > > > That's a given :)
> > > > > 
> > > > > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> > > > 
> > > > The other shoe is that notifiers can depend upon RCU grace periods, so
> > > > when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> > > > 
> > > > blk_mq_queue_reinit_notify:
> > > > /*
> > > >  * We need to freeze and reinit all existing queues.  Freezing
> > > >  * involves synchronous wait for an RCU grace period and doing 
> > > > it
> > > >  * one by one may take a long time.  Start freezing all queues 
> > > > in
> > > >  * one swoop and then wait for the completions so that freezing 
> > > > can
> > > >  * take place in parallel.
> > > >  */
> > > > list_for_each_entry(q, &all_q_list, all_q_node)
> > > > blk_mq_freeze_queue_start(q);
> > > > list_for_each_entry(q, &all_q_list, all_q_node) {
> > > > blk_mq_freeze_queue_wait(q);
> > > 
> > > Yeah, I stumbled over that already when analysing all the hotplug notifier
> > > sites. That's definitely a horrible one.
> > >  
> > > > Hohum (sharpens rock), next.
> > > 
> > > /me recommends frozen sharks
> > 
> > With the sharp rock below and the one I'll follow up with, master-rt on
> > my DL980 just passed 3 hours of endless hotplug stress concurrent with
> > endless tbench 8, stockfish and futextest.  It has never survived this
> > long with this load by a long shot.
> 
> I knew it was unlikely to surrender that quickly.  Oh well, on the
> bright side it seems to be running low on deadlocks.

The immunize rcu_sched rock did that btw.  Having accidentally whacked
the dump, I got to reproduce (took 30.03 hours) so I could analyze it.

Hohum, notifier woes definitely require somewhat sharper rocks.

I could make rcu_sched dodge the migration thread, but think I'll apply
frozen shark to blk-mq instead.

-Mike

(a clever person would wait for Sir Thomas, remaining blissfully
ignorant of the gory dragon slaying details, but whatever, premature
testing and rt mole whacking may turn up something interesting, ya
never know)


Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-25 Thread Mike Galbraith
On Fri, 2016-03-25 at 10:13 +0100, Mike Galbraith wrote:
> On Fri, 2016-03-25 at 09:52 +0100, Thomas Gleixner wrote:
> > On Fri, 25 Mar 2016, Mike Galbraith wrote:
> > > On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > > > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > > > >  
> > > > > > On the bright side, with the busted migrate enable business 
> > > > > > reverted,
> > > > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > > > iterations of Steven's hotplug stress script along side endless
> > > > > > futexstress, and is happily doing another 900 as I write this, so 
> > > > > > the
> > > > > > next -rt should finally be hotplug deadlock free.
> > > > > > 
> > > > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > > > hotplug, the other shoe will likely apply itself to my backside 
> > > > > > soon.
> > > > > 
> > > > > That's a given :)
> > > > 
> > > > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> > > 
> > > The other shoe is that notifiers can depend upon RCU grace periods, so
> > > when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> > > 
> > > blk_mq_queue_reinit_notify:
> > > /*
> > >  * We need to freeze and reinit all existing queues.  Freezing
> > >  * involves synchronous wait for an RCU grace period and doing it
> > >  * one by one may take a long time.  Start freezing all queues in
> > >  * one swoop and then wait for the completions so that freezing 
> > > can
> > >  * take place in parallel.
> > >  */
> > > list_for_each_entry(q, &all_q_list, all_q_node)
> > > blk_mq_freeze_queue_start(q);
> > > list_for_each_entry(q, &all_q_list, all_q_node) {
> > > blk_mq_freeze_queue_wait(q);
> > 
> > Yeah, I stumbled over that already when analysing all the hotplug notifier
> > sites. That's definitely a horrible one.
> >  
> > > Hohum (sharpens rock), next.
> > 
> > /me recommends frozen sharks
> 
> With the sharp rock below and the one I'll follow up with, master-rt on
> my DL980 just passed 3 hours of endless hotplug stress concurrent with
> endless tbench 8, stockfish and futextest.  It has never survived this
> long with this load by a long shot.

I knew it was unlikely to surrender that quickly.  Oh well, on the
bright side it seems to be running low on deadlocks.

Happy Easter,

-Mike

(bite me beast, 666 indeed)

[2.886077] [ cut here ]
[2.886078] kernel BUG at kernel/sched/core.c:1717!
[2.886081] invalid opcode:  [#1] PREEMPT SMP
[2.886094] Dumping ftrace buffer:
[2.886112](ftrace buffer empty)
[2.886137] Modules linked in: autofs4 edd af_packet cpufreq_conservative 
cpufreq_ondemand cpufreq_userspace cpufreq_powersave fuse loop md_mod dm_mod 
vhost_net macvtap macvlan vhost tun ipmi_ssif kvm_intel kvm joydev hid_generic 
sr_m
od cdrom sg shpchp netxen_nic hpwdt hpilo ipmi_si ipmi_msghandler irqbypass 
bnx2 iTCO_wdt iTCO_vendor_support gpio_ich pcc_cpufreq fjes i7core_edac 
edac_core lpc_ich pcspkr 8250_fintek ehci_pci acpi_cpufreq acpi_power_meter 
button ext4 m
bcache jbd2 crc16 usbhid uhci_hcd ehci_hcd sd_mod usbcore usb_common thermal 
processor scsi_dh_hp_sw scsi_dh_emc scsi_dh_rdac scsi_dh_alua ata_generic 
ata_piix libata hpsa scsi_transport_sas cciss scsi_mod
[2.886140] CPU: 2 PID: 41 Comm: migration/2 Not tainted 4.6.0-rt11 #69
[2.886140] Hardware name: Hewlett-Packard ProLiant DL980 G7, BIOS P66 
07/07/2010
[2.886142] task: 88017e34e580 ti: 88017e394000 task.ti: 
88017e394000
[2.886149] RIP: 0010:[]  [] 
select_fallback_rq+0x19c/0x1d0
[2.886149] RSP: 0018:88017e397d28  EFLAGS: 00010046
[2.886150] RAX: 0100 RBX: 88017e668348 RCX: 0003
[2.886151] RDX: 0100 RSI: 0100 RDI: 81811420
[2.886152] RBP: 88017e668000 R08: 0003 R09: 
[2.886153] R10: 8802772b3ec0 R11: 0001 R12: 0002
[2.886153] R13: 0002 R14: 88017e398000 R15: 88017e668000
[2.886155] FS:  () GS:88027668() 
knlGS:
[2.886156] CS:  0010 DS:  ES:  CR0: 80050033
[2.886156] CR2: 00695c5c CR3: 000271419000 CR4: 06e0
[2.886157] Stack:
[2.886159]  880276696900 880276696900 88017e668808 
00016900
[2.886160]  810a88f9 88017e398000 88017e398000 
0046
[2.886161]  88017e34e580 fff7 81c5be90 

[2.886162] Call Trace:
[2.886166]  [] ? migration_call+0x1b9/0x3b0
[2.886168]  [] ? notifier_call_chain+0x44/0x70
[2.886171]  [] ? notify_online+0x20/0x20
[2.886172]  [] ? __cpu_notify+0x31/0x50
[2.886173]  [] ? notify_dying+0x18/0x20
[2.

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-25 Thread Mike Galbraith
Rock #1..

hotplug/rt: Nest module_mutex inside cpu_hotplug.lock

PID: 11107  TASK: 8803b12b9900  CPU: 4   COMMAND: "stress-cpu-hotp" 

 
 #0 [88038b34f9b8] __schedule at 816b7132
 #1 [88038b34fa08] schedule at 816b796b
 #2 [88038b34fa28] rt_mutex_slowlock at 816b93ee
 #3 [88038b34fac8] rt_mutex_fastlock at 811b0e9d
 #4 [88038b34faf0] rt_mutex_lock at 816b95c8
 #5 [88038b34fb08] _mutex_lock at 816baf59
 #6 [88038b34fb28] kernfs_find_and_get_ns at 812cd573
 #7 [88038b34fb50] sysfs_remove_group at 812d100a
 #8 [88038b34fb78] thermal_throttle_cpu_callback at 81036ab9
 #9 [88038b34fb98] notifier_call_chain at 8109b8dd
#10 [88038b34fbd8] __raw_notifier_call_chain at 8109b90e
#11 [88038b34fbe8] __cpu_notify at 81072825
#12 [88038b34fbf8] cpu_notify_nofail at 81072b15
#13 [88038b34fc08] notify_dead at 81072d06
#14 [88038b34fc38] cpuhp_invoke_callback at 81073718
#15 [88038b34fc78] cpuhp_down_callbacks at 81073a70
#16 [88038b34fcb8] _cpu_down at 816afc61
#17 [88038b34fd38] do_cpu_down at 8107434c
#18 [88038b34fd60] cpu_down at 81074380
#19 [88038b34fd70] cpu_subsys_offline at 814cd844
#20 [88038b34fd80] device_offline at 814c7cca
#21 [88038b34fda8] online_store at 814c7dc0
#22 [88038b34fdd0] dev_attr_store at 814c4fb8
#23 [88038b34fde0] sysfs_kf_write at 812cfbd4
#24 [88038b34fe08] kernfs_fop_write at 812cf162
#25 [88038b34fe50] __vfs_write at 81241418
#26 [88038b34fed0] vfs_write at 81242525
#27 [88038b34ff10] sys_write at 812438e9
#28 [88038b34ff50] entry_SYSCALL_64_fastpath at 816bb4fc
RIP: 7f05f3d69cd0  RSP: 7ffdfc934468  RFLAGS: 0246
RAX: ffda  RBX: 01908770  RCX: 7f05f3d69cd0
RDX: 0002  RSI: 7f05f4898000  RDI: 0001
RBP: 7ffdfc934480   R8: 000a   R9: 7f05f4892700
R10:   R11: 0246  R12: 0007
R13: 0001  R14: 0009  R15: 000a
ORIG_RAX: 0001  CS: 0033  SS: 002b

stress-cpu-hotp blocks on kernfs_mutex, held by systemd-udevd..

crash> bt 8803b12bcb00
PID: 11130  TASK: 8803b12bcb00  CPU: 6   COMMAND: "systemd-udevd"
 #0 [88038b327a18] __schedule at 816b7132
 #1 [88038b327a68] schedule at 816b796b
 #2 [88038b327a88] rt_spin_lock_slowlock at 816b9750
 #3 [88038b327b30] rt_spin_lock_fastlock at 811b0f2c
 #4 [88038b327b50] rt_spin_lock__no_mg at 816bac7b
 #5 [88038b327b70] pin_current_cpu at 8107406a
 #6 [88038b327bb8] migrate_disable at 810a0e8e
 #7 [88038b327bd8] rt_spin_lock at 816badc9
 #8 [88038b327bf8] ida_simple_remove at 8138765c
 #9 [88038b327c18] kernfs_put at 812ccc58
#10 [88038b327c60] __kernfs_remove at 812cd15c
#11 [88038b327cc0] kernfs_remove_by_name_ns at 812ce2f3
#12 [88038b327ce8] sysfs_remove_link at 812d05e9
#13 [88038b327cf8] free_module at 8111c8f2
#14 [88038b327d30] do_init_module at 811b157f
#15 [88038b327d58] load_module at 8111f11b
#16 [88038b327e98] SYSC_finit_module at 8111faf9
#17 [88038b327f40] sys_finit_module at 8111fb3e
#18 [88038b327f50] entry_SYSCALL_64_fastpath at 816bb4fc
RIP: 7f75d9925f79  RSP: 7ffd1c040ed8  RFLAGS: 0246
RAX: ffda  RBX: 01d368e0  RCX: 7f75d9925f79
RDX:   RSI: 7f75da0233c1  RDI: 0008
RBP: 0008   R8:    R9: 01d39c82
R10: 0008  R11: 0246  R12: 7ffd1c03ff00
R13: 7ffd1c03fee0  R14: 0005  R15: 0aba9500
ORIG_RAX: 0139  CS: 0033  SS: 002b

..which stress-cpu-hotp has blocked via pin_current_cpu().  Game Over.

Signed-off-by: Mike Galbraith 
---
 kernel/cpu.c |9 +
 1 file changed, 9 insertions(+)

--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,9 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_PREEMPT_RT_BASE
+#include 
+#endif
 
 #include "smpboot.h"
 
@@ -442,10 +445,16 @@ void cpu_hotplug_begin(void)
schedule();
}
finish_wait(&cpu_hotplug.wq, &wait);
+#ifdef CONFIG_PREEMPT_RT_BASE
+   mutex_lock(&module_mutex);
+#endif
 }
 
 void cpu_hotplug_done(void)
 {
+#ifdef CONFIG_PREEMPT_RT_BASE
+   mutex_unlock(&module_mutex);
+#endif
cpu_hotplug.active_writer = NULL;
mutex_unlock(&c

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-25 Thread Mike Galbraith
On Fri, 2016-03-25 at 09:52 +0100, Thomas Gleixner wrote:
> On Fri, 25 Mar 2016, Mike Galbraith wrote:
> > On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > > >  
> > > > > On the bright side, with the busted migrate enable business reverted,
> > > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > > iterations of Steven's hotplug stress script along side endless
> > > > > futexstress, and is happily doing another 900 as I write this, so the
> > > > > next -rt should finally be hotplug deadlock free.
> > > > > 
> > > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > > hotplug, the other shoe will likely apply itself to my backside soon.
> > > > 
> > > > That's a given :)
> > > 
> > > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> > 
> > The other shoe is that notifiers can depend upon RCU grace periods, so
> > when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> > 
> > blk_mq_queue_reinit_notify:
> > /*
> >  * We need to freeze and reinit all existing queues.  Freezing
> >  * involves synchronous wait for an RCU grace period and doing it
> >  * one by one may take a long time.  Start freezing all queues in
> >  * one swoop and then wait for the completions so that freezing can
> >  * take place in parallel.
> >  */
> > list_for_each_entry(q, &all_q_list, all_q_node)
> > blk_mq_freeze_queue_start(q);
> > list_for_each_entry(q, &all_q_list, all_q_node) {
> > blk_mq_freeze_queue_wait(q);
> 
> Yeah, I stumbled over that already when analysing all the hotplug notifier
> sites. That's definitely a horrible one.
>  
> > Hohum (sharpens rock), next.
> 
> /me recommends frozen sharks

With the sharp rock below and the one I'll follow up with, master-rt on
my DL980 just passed 3 hours of endless hotplug stress concurrent with
endless tbench 8, stockfish and futextest.  It has never survived this
long with this load by a long shot.

hotplug/rt: Do not let pin_current_cpu() block RCU grace periods

Notifiers may depend upon grace periods continuing to advance
as blk_mq_queue_reinit_notify() below.

crash> bt 8803aee76400
PID: 1113   TASK: 8803aee76400  CPU: 0   COMMAND: "stress-cpu-hotp"
 #0 [880396fe7ad8] __schedule at 816b7142
 #1 [880396fe7b28] schedule at 816b797b
 #2 [880396fe7b48] blk_mq_freeze_queue_wait at 8135c5ac
 #3 [880396fe7b80] blk_mq_queue_reinit_notify at 8135f819
 #4 [880396fe7b98] notifier_call_chain at 8109b8ed
 #5 [880396fe7bd8] __raw_notifier_call_chain at 8109b91e
 #6 [880396fe7be8] __cpu_notify at 81072825
 #7 [880396fe7bf8] cpu_notify_nofail at 81072b15
 #8 [880396fe7c08] notify_dead at 81072d06
 #9 [880396fe7c38] cpuhp_invoke_callback at 81073718
#10 [880396fe7c78] cpuhp_down_callbacks at 81073a70
#11 [880396fe7cb8] _cpu_down at 816afc71
#12 [880396fe7d38] do_cpu_down at 8107435c
#13 [880396fe7d60] cpu_down at 81074390
#14 [880396fe7d70] cpu_subsys_offline at 814cd854
#15 [880396fe7d80] device_offline at 814c7cda
#16 [880396fe7da8] online_store at 814c7dd0
#17 [880396fe7dd0] dev_attr_store at 814c4fc8
#18 [880396fe7de0] sysfs_kf_write at 812cfbe4
#19 [880396fe7e08] kernfs_fop_write at 812cf172
#20 [880396fe7e50] __vfs_write at 81241428
#21 [880396fe7ed0] vfs_write at 81242535
#22 [880396fe7f10] sys_write at 812438f9
#23 [880396fe7f50] entry_SYSCALL_64_fastpath at 816bb4bc
RIP: 7fafd918acd0  RSP: 7ffd2ca956e8  RFLAGS: 0246
RAX: ffda  RBX: 0226a770  RCX: 7fafd918acd0
RDX: 0002  RSI: 7fafd9cb9000  RDI: 0001
RBP: 7ffd2ca95700   R8: 000a   R9: 7fafd9cb3700
R10:   R11: 0246  R12: 0007
R13: 0001  R14: 0009  R15: 000a
ORIG_RAX: 0001  CS: 0033  SS: 002b

blk_mq_queue_reinit_notify:
/*
 * We need to freeze and reinit all existing queues.  Freezing
 * involves synchronous wait for an RCU grace period and doing it
 * one by one may take a long time.  Start freezing all queues in
 * one swoop and then wait for the completions so that freezing can
 * take place in parallel.
 */
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node) {
blk_mq_freeze_queue_wait(q);

crash> bt 880176cc9900
PID: 17 TASK: 880176cc9900  CPU: 0   COMMAND: "rcu_sched"
 #0 [880176cd7ab8]

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-25 Thread Thomas Gleixner
On Fri, 25 Mar 2016, Mike Galbraith wrote:
> On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > >  
> > > > On the bright side, with the busted migrate enable business reverted,
> > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > iterations of Steven's hotplug stress script along side endless
> > > > futexstress, and is happily doing another 900 as I write this, so the
> > > > next -rt should finally be hotplug deadlock free.
> > > > 
> > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > hotplug, the other shoe will likely apply itself to my backside soon.
> > > 
> > > That's a given :)
> > 
> > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> 
> The other shoe is that notifiers can depend upon RCU grace periods, so
> when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> 
> blk_mq_queue_reinit_notify:
> /*
>  * We need to freeze and reinit all existing queues.  Freezing
>  * involves synchronous wait for an RCU grace period and doing it
>  * one by one may take a long time.  Start freezing all queues in
>  * one swoop and then wait for the completions so that freezing can
>  * take place in parallel.
>  */
> list_for_each_entry(q, &all_q_list, all_q_node)
> blk_mq_freeze_queue_start(q);
> list_for_each_entry(q, &all_q_list, all_q_node) {
> blk_mq_freeze_queue_wait(q);

Yeah, I stumbled over that already when analysing all the hotplug notifier
sites. That's definitely a horrible one.
 
> Hohum (sharpens rock), next.

/me recommends frozen sharks

Thanks,

   tglx


Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-24 Thread Mike Galbraith
On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> >  
> > > On the bright side, with the busted migrate enable business reverted,
> > > plus one dinky change from me [1], master-rt.today has completed 100
> > > iterations of Steven's hotplug stress script along side endless
> > > futexstress, and is happily doing another 900 as I write this, so the
> > > next -rt should finally be hotplug deadlock free.
> > > 
> > > Thomas's state machinery seems to work wonders.  'course this being
> > > hotplug, the other shoe will likely apply itself to my backside soon.
> > 
> > That's a given :)
> 
> blk-mq applied it shortly after I was satisfied enough to poke xmit.

The other shoe is that notifiers can depend upon RCU grace periods, so
when pin_current_cpu() snags rcu_sched, the hotplug game is over.

blk_mq_queue_reinit_notify:
/*
 * We need to freeze and reinit all existing queues.  Freezing
 * involves synchronous wait for an RCU grace period and doing it
 * one by one may take a long time.  Start freezing all queues in
 * one swoop and then wait for the completions so that freezing can
 * take place in parallel.
 */
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node) {
blk_mq_freeze_queue_wait(q);

Hohum (sharpens rock), next.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-24 Thread Mike Galbraith
On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
>  
> > On the bright side, with the busted migrate enable business reverted,
> > plus one dinky change from me [1], master-rt.today has completed 100
> > iterations of Steven's hotplug stress script along side endless
> > futexstress, and is happily doing another 900 as I write this, so the
> > next -rt should finally be hotplug deadlock free.
> > 
> > Thomas's state machinery seems to work wonders.  'course this being
> > hotplug, the other shoe will likely apply itself to my backside soon.
> 
> That's a given :)

blk-mq applied it shortly after I was satisfied enough to poke xmit.

> I really wonder what makes the change. The only thing which comes to my mind
> is the enforcement of running the online and down_prepare callbacks on the
> plugged cpu instead of doing it wherever the scheduler decides to run it.

No idea, but it certainly seems.. well, markedly less brick like.
 
> > 1. nest module_mutex inside hotplug_lock to prevent bloody systemd
> > -udevd from blocking in migrate_disable() while holding kernfs_mutex
> > during module load, putting a quick end to hotplug stress testing.
>  
> Did I miss a patch here or is that still in your pile?

You didn't miss it, it wasn't tested enough to consider sending.. and
now I'm starting down the familiar "next" path again.  Oh dear.

-Mike


Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-24 Thread Thomas Gleixner
On Thu, 24 Mar 2016, Mike Galbraith wrote:
> On Sun, 2016-03-20 at 09:43 +0100, Mike Galbraith wrote:
> > On Sat, 2016-02-13 at 00:02 +0100, Sebastian Andrzej Siewior wrote:
> > > From: Thomas Gleixner 
> > > 
> > > We currently disable migration across lock acquisition. That includes the 
> > > part
> > > where we block on the lock and schedule out. We cannot disable migration 
> > > after
> > > taking the lock as that would cause a possible lock inversion.
> > > 
> > > But we can be smart and enable migration when we block and schedule out. 
> > > That
> > > allows the scheduler to place the task freely at least if this is the 
> > > first
> > > migrate disable level. For nested locking this does not help at all.
> > 
> > I met a problem while testing shiny new hotplug machinery.
> > 
> > rt/locking: Fix rt_spin_lock_slowlock() vs hotplug migrate_disable() bug
> > 
> > migrate_disable() -> pin_current_cpu() -> hotplug_lock() leads to..
> > > BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
> > ..so let's call migrate_disable() after we acquire the lock instead.
> 
Well crap, that wasn't very clever. A little voice kept nagging me, and
> yesterday I realized what it was grumbling about, namely that doing
> migrate_disable() after lock acquisition will resurrect a hotplug
> deadlock that we fixed up a while back.

Glad you found out yourself. Telling you that was on my todo list 
 
> On the bright side, with the busted migrate enable business reverted,
> plus one dinky change from me [1], master-rt.today has completed 100
> iterations of Steven's hotplug stress script along side endless
> futexstress, and is happily doing another 900 as I write this, so the
> next -rt should finally be hotplug deadlock free.
> 
> Thomas's state machinery seems to work wonders.  'course this being
> hotplug, the other shoe will likely apply itself to my backside soon.

That's a given :)

I really wonder what makes the change. The only thing which comes to my mind
is the enforcement of running the online and down_prepare callbacks on the
plugged cpu instead of doing it wherever the scheduler decides to run it.
 
> 1. nest module_mutex inside hotplug_lock to prevent bloody systemd
> -udevd from blocking in migrate_disable() while holding kernfs_mutex
> during module load, putting a quick end to hotplug stress testing.
 
Did I miss a patch here or is that still in your pile?

Thanks,

tglx


Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-24 Thread Mike Galbraith
On Sun, 2016-03-20 at 09:43 +0100, Mike Galbraith wrote:
> On Sat, 2016-02-13 at 00:02 +0100, Sebastian Andrzej Siewior wrote:
> > From: Thomas Gleixner 
> > 
> > We currently disable migration across lock acquisition. That includes the 
> > part
> > where we block on the lock and schedule out. We cannot disable migration 
> > after
> > taking the lock as that would cause a possible lock inversion.
> > 
> > But we can be smart and enable migration when we block and schedule out. 
> > That
> > allows the scheduler to place the task freely at least if this is the first
> > migrate disable level. For nested locking this does not help at all.
> 
> I met a problem while testing shiny new hotplug machinery.
> 
> rt/locking: Fix rt_spin_lock_slowlock() vs hotplug migrate_disable() bug
> 
> migrate_disable() -> pin_current_cpu() -> hotplug_lock() leads to..
>   > BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
> ..so let's call migrate_disable() after we acquire the lock instead.

Well crap, that wasn't very clever. A little voice kept nagging me, and
yesterday I realized what it was grumbling about, namely that doing
migrate_disable() after lock acquisition will resurrect a hotplug
deadlock that we fixed up a while back.

On the bright side, with the busted migrate enable business reverted,
plus one dinky change from me [1], master-rt.today has completed 100
iterations of Steven's hotplug stress script along side endless
futexstress, and is happily doing another 900 as I write this, so the
next -rt should finally be hotplug deadlock free.

Thomas's state machinery seems to work wonders.  'course this being
hotplug, the other shoe will likely apply itself to my backside soon.

-Mike

1. nest module_mutex inside hotplug_lock to prevent bloody systemd
-udevd from blocking in migrate_disable() while holding kernfs_mutex
during module load, putting a quick end to hotplug stress testing.


Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

2016-03-20 Thread Mike Galbraith
On Sat, 2016-02-13 at 00:02 +0100, Sebastian Andrzej Siewior wrote:
> From: Thomas Gleixner 
> 
> We currently disable migration across lock acquisition. That includes the part
> where we block on the lock and schedule out. We cannot disable migration after
> taking the lock as that would cause a possible lock inversion.
> 
> But we can be smart and enable migration when we block and schedule out. That
> allows the scheduler to place the task freely at least if this is the first
> migrate disable level. For nested locking this does not help at all.

I met a problem while testing shiny new hotplug machinery.

rt/locking: Fix rt_spin_lock_slowlock() vs hotplug migrate_disable() bug

migrate_disable() -> pin_current_cpu() -> hotplug_lock() leads to..
BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
..so let's call migrate_disable() after we acquire the lock instead.

Fixes: e24b142cfb4a rt/locking: Reenable migration accross schedule
Signed-off-by: Mike Galbraith 
---
 kernel/locking/rtmutex.c |   15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1011,7 +1011,7 @@ static void  noinline __sched rt_spin_lo
struct task_struct *lock_owner, *self = current;
struct rt_mutex_waiter waiter, *top_waiter;
unsigned long flags;
-   int ret;
+   bool mg_disable = false;
 
rt_mutex_init_waiter(&waiter, true);
 
@@ -1035,8 +1035,7 @@ static void  noinline __sched rt_spin_lo
__set_current_state_no_track(TASK_UNINTERRUPTIBLE);
raw_spin_unlock(&self->pi_lock);
 
-   ret = task_blocks_on_rt_mutex(lock, &waiter, self, 
RT_MUTEX_MIN_CHAINWALK);
-   BUG_ON(ret);
+   BUG_ON(task_blocks_on_rt_mutex(lock, &waiter, self, 
RT_MUTEX_MIN_CHAINWALK));
 
for (;;) {
/* Try to acquire the lock again. */
@@ -1051,11 +1050,12 @@ static void  noinline __sched rt_spin_lo
debug_rt_mutex_print_deadlock(&waiter);
 
if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
-   if (mg_off)
+   if (mg_off && self->migrate_disable == 1) {
+   mg_off = false;
+   mg_disable = true;
migrate_enable();
+   }
schedule();
-   if (mg_off)
-   migrate_disable();
}
 
raw_spin_lock_irqsave(&lock->wait_lock, flags);
@@ -1088,6 +1088,9 @@ static void  noinline __sched rt_spin_lo
 
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
+   if (mg_disable)
+   migrate_disable();
+
debug_rt_mutex_free_waiter(&waiter);
 }