Re: Use SMR_TAILQ for `ps_threads'

2020-12-07 Thread Claudio Jeker
On Mon, Dec 07, 2020 at 10:13:44AM -0300, Martin Pieuchot wrote:
> On 05/12/20(Sat) 22:34, Jonathan Matthew wrote:
> > On Fri, Dec 04, 2020 at 10:03:46AM -0300, Martin Pieuchot wrote:
> > > On 04/12/20(Fri) 12:01, Jonathan Matthew wrote:
> > > > On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > > > > [...] 
> > > > > Could you try the diff below that only call smr_barrier() for multi-
> > > > > threaded processes with threads still in the list.  I guess this also
> > > > > answers guenther@'s question.  The same could be done with 
> > > > > smr_flush().
> > > > 
> > > > This removes the overhead, more or less.  Are we only looking at 
> > > > unlocking access
> > > > to ps_threads from within a process (not the sysctl or ptrace stuff)?  
> > > > Otherwise
> > > > this doesn't seem safe.
> > > 
> > > I'd argue that if `ps_thread' is being iterated the CPU doing the
> > > iteration must already have a reference to the "struct process" so
> > > the serialization should be done on this reference.
> > 
> > Sounds reasonable to me.
> > 
> > > 
> > > Now I doubt we'll be able to answer all the questions right now.  If we
> > > can find a path forward that doesn't decrease performances too much and
> > > allow us to move signal delivery and sleep out of the KERNEL_LOCK()
> > > that's a huge win.
> > 
> > I think we're at an acceptable performance hit now, so if this lets you
> > progress with unlocking signal delivery, I'm happy.
> 
> Ok, so here's the conversion to SMR_TAILQ again.  Any ok for this one
> before we choose if we go with smr_barrier(), smr_flush() or a callback?

This is still OK claudio@. Once this is in I can dig out my callback diff.
 
> Index: lib/libkvm/kvm_proc2.c
> ===
> RCS file: /cvs/src/lib/libkvm/kvm_proc2.c,v
> retrieving revision 1.31
> diff -u -p -r1.31 kvm_proc2.c
> --- lib/libkvm/kvm_proc2.c11 Dec 2019 12:36:28 -  1.31
> +++ lib/libkvm/kvm_proc2.c7 Dec 2020 12:46:48 -
> @@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg,
>   kp.p_pctcpu = 0;
>   kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
>   SIDL;
> - for (p = TAILQ_FIRST(_threads); p != NULL; 
> - p = TAILQ_NEXT(, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
> + p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, )) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> @@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg,
>   if (!dothreads)
>   continue;
>  
> - for (p = TAILQ_FIRST(_threads); p != NULL; 
> - p = TAILQ_NEXT(, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, )) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> Index: sys/kern/exec_elf.c
> ===
> RCS file: /cvs/src/sys/kern/exec_elf.c,v
> retrieving revision 1.155
> diff -u -p -r1.155 exec_elf.c
> --- sys/kern/exec_elf.c   6 Jul 2020 13:33:09 -   1.155
> +++ sys/kern/exec_elf.c   7 Dec 2020 12:46:48 -
> @@ -85,6 +85,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void 
>* threads in the process have been stopped and the list can't
>* change.
>*/
> - TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
>   if (q == p) /* we've taken care of this thread */
>   continue;
>   error = coredump_note_elf(q, iocookie, );
> Index: sys/kern/init_main.c
> ===
> RCS file: /cvs/src/sys/kern/init_main.c,v
> retrieving revision 1.301
> diff -u -p -r1.301 init_main.c
> --- sys/kern/init_main.c  13 Sep 2020 09:42:31 -  1.301
> +++ sys/kern/init_main.c  7 Dec 2020 12:46:48 -
> @@ -519,7 +519,7 @@ main(void *framep)
>*/
>   LIST_FOREACH(pr, , ps_list) {
>   nanouptime(>ps_start);
> - TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
>   nanouptime(>p_cpu->ci_schedstate.spc_runtime);
>   timespecclear(>p_rtime);
>   }
> Index: sys/kern/kern_exit.c
> 

Re: Use SMR_TAILQ for `ps_threads'

2020-12-07 Thread Martin Pieuchot
On 05/12/20(Sat) 22:34, Jonathan Matthew wrote:
> On Fri, Dec 04, 2020 at 10:03:46AM -0300, Martin Pieuchot wrote:
> > On 04/12/20(Fri) 12:01, Jonathan Matthew wrote:
> > > On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > > > [...] 
> > > > Could you try the diff below that only call smr_barrier() for multi-
> > > > threaded processes with threads still in the list.  I guess this also
> > > > answers guenther@'s question.  The same could be done with smr_flush().
> > > 
> > > This removes the overhead, more or less.  Are we only looking at 
> > > unlocking access
> > > to ps_threads from within a process (not the sysctl or ptrace stuff)?  
> > > Otherwise
> > > this doesn't seem safe.
> > 
> > I'd argue that if `ps_thread' is being iterated the CPU doing the
> > iteration must already have a reference to the "struct process" so
> > the serialization should be done on this reference.
> 
> Sounds reasonable to me.
> 
> > 
> > Now I doubt we'll be able to answer all the questions right now.  If we
> > can find a path forward that doesn't decrease performances too much and
> > allow us to move signal delivery and sleep out of the KERNEL_LOCK()
> > that's a huge win.
> 
> I think we're at an acceptable performance hit now, so if this lets you
> progress with unlocking signal delivery, I'm happy.

Ok, so here's the conversion to SMR_TAILQ again.  Any ok for this one
before we choose whether we go with smr_barrier(), smr_flush() or a callback?

Index: lib/libkvm/kvm_proc2.c
===
RCS file: /cvs/src/lib/libkvm/kvm_proc2.c,v
retrieving revision 1.31
diff -u -p -r1.31 kvm_proc2.c
--- lib/libkvm/kvm_proc2.c  11 Dec 2019 12:36:28 -  1.31
+++ lib/libkvm/kvm_proc2.c  7 Dec 2020 12:46:48 -
@@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg,
kp.p_pctcpu = 0;
kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
SIDL;
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
+   p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
@@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg,
if (!dothreads)
continue;
 
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
Index: sys/kern/exec_elf.c
===
RCS file: /cvs/src/sys/kern/exec_elf.c,v
retrieving revision 1.155
diff -u -p -r1.155 exec_elf.c
--- sys/kern/exec_elf.c 6 Jul 2020 13:33:09 -   1.155
+++ sys/kern/exec_elf.c 7 Dec 2020 12:46:48 -
@@ -85,6 +85,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void 
 * threads in the process have been stopped and the list can't
 * change.
 */
-   TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
if (q == p) /* we've taken care of this thread */
continue;
error = coredump_note_elf(q, iocookie, );
Index: sys/kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.301
diff -u -p -r1.301 init_main.c
--- sys/kern/init_main.c13 Sep 2020 09:42:31 -  1.301
+++ sys/kern/init_main.c7 Dec 2020 12:46:48 -
@@ -519,7 +519,7 @@ main(void *framep)
 */
LIST_FOREACH(pr, , ps_list) {
nanouptime(>ps_start);
-   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
nanouptime(>p_cpu->ci_schedstate.spc_runtime);
timespecclear(>p_rtime);
}
Index: sys/kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.191
diff -u -p -r1.191 kern_exit.c
--- sys/kern/kern_exit.c16 Nov 2020 18:37:06 -  1.191
+++ sys/kern/kern_exit.c7 Dec 2020 12:46:48 -
@@ -63,6 +63,7 @@
 #ifdef SYSVSEM
 #include 
 #endif

Re: Use SMR_TAILQ for `ps_threads'

2020-12-05 Thread Jonathan Matthew
On Fri, Dec 04, 2020 at 10:03:46AM -0300, Martin Pieuchot wrote:
> On 04/12/20(Fri) 12:01, Jonathan Matthew wrote:
> > On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > > [...] 
> > > Could you try the diff below that only call smr_barrier() for multi-
> > > threaded processes with threads still in the list.  I guess this also
> > > answers guenther@'s question.  The same could be done with smr_flush().
> > 
> > This removes the overhead, more or less.  Are we only looking at unlocking 
> > access
> > to ps_threads from within a process (not the sysctl or ptrace stuff)?  
> > Otherwise
> > this doesn't seem safe.
> 
> I'd argue that if `ps_thread' is being iterated the CPU doing the
> iteration must already have a reference to the "struct process" so
> the serialization should be done on this reference.

Sounds reasonable to me.

> 
> Now I doubt we'll be able to answer all the questions right now.  If we
> can find a path forward that doesn't decrease performances too much and
> allow us to move signal delivery and sleep out of the KERNEL_LOCK()
> that's a huge win.

I think we're at an acceptable performance hit now, so if this lets you
progress with unlocking signal delivery, I'm happy.



Re: Use SMR_TAILQ for `ps_threads'

2020-12-04 Thread Martin Pieuchot
On 04/12/20(Fri) 12:01, Jonathan Matthew wrote:
> On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > [...] 
> > Could you try the diff below that only call smr_barrier() for multi-
> > threaded processes with threads still in the list.  I guess this also
> > answers guenther@'s question.  The same could be done with smr_flush().
> 
> This removes the overhead, more or less.  Are we only looking at unlocking 
> access
> to ps_threads from within a process (not the sysctl or ptrace stuff)?  
> Otherwise
> this doesn't seem safe.

I'd argue that if `ps_threads' is being iterated the CPU doing the
iteration must already have a reference to the "struct process" so
the serialization should be done on this reference.

Now I doubt we'll be able to answer all the questions right now.  If we
can find a path forward that doesn't decrease performance too much and
allows us to move signal delivery and sleep out of the KERNEL_LOCK()
that's a huge win.



Re: Use SMR_TAILQ for `ps_threads'

2020-12-03 Thread Jonathan Matthew
On Wed, Dec 02, 2020 at 07:44:02PM +0100, Anton Lindqvist wrote:
> On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> > > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > > > [...] 
> > > > > Did you run a make build with that smr_barrier() in it and checked 
> > > > > that it
> > > > > does not cause a slow down? I am sceptical, smr_barrier() is a very 
> > > > > slow
> > > > > construct which introduces large delays and should be avoided whenever
> > > > > possible.
> > > > 
> > > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > > > below, without noticeable difference.
> > > > 
> > > > I'm happy to hear from sceptical performance checkers :o)
> > > 
> > > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build 
> > > time from
> > > ~3m06s to ~3m44s, which seems a bit much to me.
> > 
> > Do you know if this is due to an increase of %spin time?
> > 
> > > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple 
> > > of
> > > seconds, and it seems warranted here.
> > 
> > Could you try the diff below that only call smr_barrier() for multi-
> > threaded processes with threads still in the list.  I guess this also
> > answers guenther@'s question.  The same could be done with smr_flush().
> 
> I'm wondering if smr_grace_wait() could be improved on amd64, assuming
> SMT is disabled, by skipping offline CPUs.

This doesn't make much of a difference when using smr_barrier(), but with
smr_flush() it removes much of the overhead on this system with 8 of 16 cpus
online.  Of course as Visa and Mark point out this is risky without more
guarantees about what offline cpus are actually doing.  If we start using SMR
in ways that make the delay visible to user processes, it'll probably be worth
looking at.

> 
> Index: kern/kern_smr.c
> ===
> RCS file: /cvs/src/sys/kern/kern_smr.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 kern_smr.c
> --- kern/kern_smr.c   3 Apr 2020 03:36:56 -   1.8
> +++ kern/kern_smr.c   2 Dec 2020 18:41:29 -
> @@ -142,7 +142,7 @@ smr_grace_wait(void)
>  
>   ci_start = curcpu();
>   CPU_INFO_FOREACH(cii, ci) {
> - if (ci == ci_start)
> + if (ci == ci_start || !cpu_is_online(ci))
>   continue;
>   sched_peg_curproc(ci);
>   }
> 



Re: Use SMR_TAILQ for `ps_threads'

2020-12-03 Thread Jonathan Matthew
On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > > [...] 
> > > > Did you run a make build with that smr_barrier() in it and checked that 
> > > > it
> > > > does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> > > > construct which introduces large delays and should be avoided whenever
> > > > possible.
> > > 
> > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > > below, without noticeable difference.
> > > 
> > > I'm happy to hear from sceptical performance checkers :o)
> > 
> > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build 
> > time from
> > ~3m06s to ~3m44s, which seems a bit much to me.
> 
> Do you know if this is due to an increase of %spin time?

It actually decreased %spin, and the total system cpu time used during the 
build was
decreased from around 6m30s to around 5m15, so I think it's mostly the effect 
of the
delayed wakeup of the SMR thread in smr_dispatch().

There's also this:

$ time sleep 1
0m01.11s real 0m00.00s user 0m00.00s system


> 
> > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
> > seconds, and it seems warranted here.
> 
> Could you try the diff below that only call smr_barrier() for multi-
> threaded processes with threads still in the list.  I guess this also
> answers guenther@'s question.  The same could be done with smr_flush().

This removes the overhead, more or less.  Are we only looking at unlocking 
access
to ps_threads from within a process (not the sysctl or ptrace stuff)?  Otherwise
this doesn't seem safe.



Re: Use SMR_TAILQ for `ps_threads'

2020-12-03 Thread Mark Kettenis
> Date: Wed, 2 Dec 2020 19:44:02 +0100
> From: Anton Lindqvist 
> 
> On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> > On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> > > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > > > [...] 
> > > > > Did you run a make build with that smr_barrier() in it and checked 
> > > > > that it
> > > > > does not cause a slow down? I am sceptical, smr_barrier() is a very 
> > > > > slow
> > > > > construct which introduces large delays and should be avoided whenever
> > > > > possible.
> > > > 
> > > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > > > below, without noticeable difference.
> > > > 
> > > > I'm happy to hear from sceptical performance checkers :o)
> > > 
> > > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build 
> > > time from
> > > ~3m06s to ~3m44s, which seems a bit much to me.
> > 
> > Do you know if this is due to an increase of %spin time?
> > 
> > > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple 
> > > of
> > > seconds, and it seems warranted here.
> > 
> > Could you try the diff below that only call smr_barrier() for multi-
> > threaded processes with threads still in the list.  I guess this also
> > answers guenther@'s question.  The same could be done with smr_flush().
> 
> I'm wondering if smr_grace_wait() could be improved on amd64, assuming
> SMT is disabled, by skipping offline CPUs.

I think this would come back to bite us at some point.

> Index: kern/kern_smr.c
> ===
> RCS file: /cvs/src/sys/kern/kern_smr.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 kern_smr.c
> --- kern/kern_smr.c   3 Apr 2020 03:36:56 -   1.8
> +++ kern/kern_smr.c   2 Dec 2020 18:41:29 -
> @@ -142,7 +142,7 @@ smr_grace_wait(void)
>  
>   ci_start = curcpu();
>   CPU_INFO_FOREACH(cii, ci) {
> - if (ci == ci_start)
> + if (ci == ci_start || !cpu_is_online(ci))
>   continue;
>   sched_peg_curproc(ci);
>   }
> 
> 



Re: Use SMR_TAILQ for `ps_threads'

2020-12-03 Thread Visa Hankala
On Wed, Dec 02, 2020 at 07:44:02PM +0100, Anton Lindqvist wrote:
> I'm wondering if smr_grace_wait() could be improved on amd64, assuming
> SMT is disabled, by skipping offline CPUs.
> 
> Index: kern/kern_smr.c
> ===
> RCS file: /cvs/src/sys/kern/kern_smr.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 kern_smr.c
> --- kern/kern_smr.c   3 Apr 2020 03:36:56 -   1.8
> +++ kern/kern_smr.c   2 Dec 2020 18:41:29 -
> @@ -142,7 +142,7 @@ smr_grace_wait(void)
>  
>   ci_start = curcpu();
>   CPU_INFO_FOREACH(cii, ci) {
> - if (ci == ci_start)
> + if (ci == ci_start || !cpu_is_online(ci))
>   continue;
>   sched_peg_curproc(ci);
>   }

That is not safe. The code should coordinate with switching of hw.smt.
Also, the CPUs that are offline because of hw.smt are not totally
inactive. They might still access SMR-protected data as a result of
interrupts for example.



Re: Use SMR_TAILQ for `ps_threads'

2020-12-02 Thread Anton Lindqvist
On Wed, Dec 02, 2020 at 11:41:04AM -0300, Martin Pieuchot wrote:
> On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> > On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > > [...] 
> > > > Did you run a make build with that smr_barrier() in it and checked that 
> > > > it
> > > > does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> > > > construct which introduces large delays and should be avoided whenever
> > > > possible.
> > > 
> > > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > > below, without noticeable difference.
> > > 
> > > I'm happy to hear from sceptical performance checkers :o)
> > 
> > On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build 
> > time from
> > ~3m06s to ~3m44s, which seems a bit much to me.
> 
> Do you know if this is due to an increase of %spin time?
> 
> > Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
> > seconds, and it seems warranted here.
> 
> Could you try the diff below that only call smr_barrier() for multi-
> threaded processes with threads still in the list.  I guess this also
> answers guenther@'s question.  The same could be done with smr_flush().

I'm wondering if smr_grace_wait() could be improved on amd64, assuming
SMT is disabled, by skipping offline CPUs.

Index: kern/kern_smr.c
===
RCS file: /cvs/src/sys/kern/kern_smr.c,v
retrieving revision 1.8
diff -u -p -r1.8 kern_smr.c
--- kern/kern_smr.c 3 Apr 2020 03:36:56 -   1.8
+++ kern/kern_smr.c 2 Dec 2020 18:41:29 -
@@ -142,7 +142,7 @@ smr_grace_wait(void)
 
ci_start = curcpu();
CPU_INFO_FOREACH(cii, ci) {
-   if (ci == ci_start)
+   if (ci == ci_start || !cpu_is_online(ci))
continue;
sched_peg_curproc(ci);
}



Re: Use SMR_TAILQ for `ps_threads'

2020-12-02 Thread Martin Pieuchot
On 02/12/20(Wed) 17:27, Jonathan Matthew wrote:
> On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > [...] 
> > > Did you run a make build with that smr_barrier() in it and checked that it
> > > does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> > > construct which introduces large delays and should be avoided whenever
> > > possible.
> > 
> > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > below, without noticeable difference.
> > 
> > I'm happy to hear from sceptical performance checkers :o)
> 
> On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build time 
> from
> ~3m06s to ~3m44s, which seems a bit much to me.

Do you know if this is due to an increase of %spin time?

> Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
> seconds, and it seems warranted here.

Could you try the diff below that only calls smr_barrier() for multi-
threaded processes with threads still in the list?  I guess this also
answers guenther@'s question.  The same could be done with smr_flush().

diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
index 96f7dc91b92..1f4f9b914bb 100644
--- lib/libkvm/kvm_proc2.c
+++ lib/libkvm/kvm_proc2.c
@@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process *pr,
kp.p_pctcpu = 0;
kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
SIDL;
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
+   p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
@@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process *pr,
if (!dothreads)
continue;
 
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 5e455208663..575273b306c 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -85,6 +85,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, size_t 
*sizep)
 * threads in the process have been stopped and the list can't
 * change.
 */
-   TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
if (q == p) /* we've taken care of this thread */
continue;
error = coredump_note_elf(q, iocookie, );
diff --git sys/kern/init_main.c sys/kern/init_main.c
index fed6be19435..2b657ffe328 100644
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -519,7 +519,7 @@ main(void *framep)
 */
LIST_FOREACH(pr, , ps_list) {
nanouptime(>ps_start);
-   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
nanouptime(>p_cpu->ci_schedstate.spc_runtime);
timespecclear(>p_rtime);
}
diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
index a20775419e3..8cde5eca266 100644
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -63,6 +63,7 @@
 #ifdef SYSVSEM
 #include 
 #endif
+#include 
 #include 
 
 #include 
@@ -161,7 +162,9 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
}
 
/* unlink ourselves from the active threads */
-   TAILQ_REMOVE(>ps_threads, p, p_thr_link);
+   SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
+   if (!SMR_TAILQ_EMPTY_LOCKED(>ps_threads))
+   smr_barrier();
if ((p->p_flag & P_THREAD) == 0) {
/* main thread gotta wait because it has the pid, et al */
while (pr->ps_refcnt > 1)
@@ -724,7 +727,7 @@ process_zap(struct process *pr)
if (pr->ps_ptstat != NULL)
free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
pool_put(_pool, pr->ps_ru);
-   KASSERT(TAILQ_EMPTY(>ps_threads));
+   KASSERT(SMR_TAILQ_EMPTY_LOCKED(>ps_threads));
lim_free(pr->ps_limit);
crfree(pr->ps_ucred);
pool_put(_pool, pr);
diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
index 

Re: Use SMR_TAILQ for `ps_threads'

2020-12-02 Thread Claudio Jeker
On Wed, Dec 02, 2020 at 05:27:59PM +1000, Jonathan Matthew wrote:
> On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> > On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > > [...] 
> > > Did you run a make build with that smr_barrier() in it and checked that it
> > > does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> > > construct which introduces large delays and should be avoided whenever
> > > possible.
> > 
> > I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> > below, without noticeable difference.
> > 
> > I'm happy to hear from sceptical performance checkers :o)
> 
> On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build
> time from ~3m06s to ~3m44s, which seems a bit much to me.

The more CPUs there are, the worse smr_barrier() behaves since it will
require the callback to be run on all CPUs one after the other. This is
why smr_barrier() should not be used in any performance-critical path.
 
> Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
> seconds, and it seems warranted here.

I still think that using smr_call() here is the best approach. I will dig
out my diff once mpi@ commits his tailq conversion.
 
> > 
> > diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
> > index 96f7dc91b92..1f4f9b914bb 100644
> > --- lib/libkvm/kvm_proc2.c
> > +++ lib/libkvm/kvm_proc2.c
> > @@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> > *pr,
> > kp.p_pctcpu = 0;
> > kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
> > SIDL;
> > -   for (p = TAILQ_FIRST(_threads); p != NULL; 
> > -   p = TAILQ_NEXT(, p_thr_link)) {
> > +   for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
> > +   p != NULL;
> > +   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
> > if (KREAD(kd, (u_long)p, )) {
> > _kvm_err(kd, kd->program,
> > "can't read proc at %lx",
> > @@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> > *pr,
> > if (!dothreads)
> > continue;
> >  
> > -   for (p = TAILQ_FIRST(_threads); p != NULL; 
> > -   p = TAILQ_NEXT(, p_thr_link)) {
> > +   for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
> > +   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
> > if (KREAD(kd, (u_long)p, )) {
> > _kvm_err(kd, kd->program,
> > "can't read proc at %lx",
> > diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> > index 5e455208663..575273b306c 100644
> > --- sys/kern/exec_elf.c
> > +++ sys/kern/exec_elf.c
> > @@ -85,6 +85,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #include 
> >  
> > @@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, 
> > size_t *sizep)
> >  * threads in the process have been stopped and the list can't
> >  * change.
> >  */
> > -   TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
> > +   SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
> > if (q == p) /* we've taken care of this thread */
> > continue;
> > error = coredump_note_elf(q, iocookie, );
> > diff --git sys/kern/init_main.c sys/kern/init_main.c
> > index fed6be19435..2b657ffe328 100644
> > --- sys/kern/init_main.c
> > +++ sys/kern/init_main.c
> > @@ -519,7 +519,7 @@ main(void *framep)
> >  */
> > LIST_FOREACH(pr, , ps_list) {
> > nanouptime(>ps_start);
> > -   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> > +   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
> > nanouptime(>p_cpu->ci_schedstate.spc_runtime);
> > timespecclear(>p_rtime);
> > }
> > diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
> > index a20775419e3..3c526ab83b8 100644
> > --- sys/kern/kern_exit.c
> > +++ sys/kern/kern_exit.c
> > @@ -63,6 +63,7 @@
> >  #ifdef SYSVSEM
> >  #include 
> >  #endif
> > +#include 
> >  #include 
> >  
> >  #include 
> > @@ -161,7 +162,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> > }
> >  
> > /* unlink ourselves from the active threads */
> > -   TAILQ_REMOVE(>ps_threads, p, p_thr_link);
> > +   SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
> > +   smr_barrier();
> > if ((p->p_flag & P_THREAD) == 0) {
> > /* main thread gotta wait because it has the pid, et al */
> > while (pr->ps_refcnt > 1)
> > @@ -724,7 +726,7 @@ process_zap(struct process *pr)
> > if (pr->ps_ptstat != NULL)
> > free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
> > pool_put(_pool, pr->ps_ru);
> > -   KASSERT(TAILQ_EMPTY(>ps_threads));
> > +   

Re: Use SMR_TAILQ for `ps_threads'

2020-12-02 Thread Philip Guenther
On Tue, Dec 1, 2020 at 5:48 AM Martin Pieuchot  wrote:
...

> exit1() for a multi-threaded process does the following:
>
> atomic_setbits_int(>p_flag, P_WEXIT);
> single_thread_check(p, 0);  // directly or via single_thread_set()
> SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
>
> Note that exit1() can't be called in parallel.
>

Well, multiple threads in a process can easily hit exit1() practically
concurrently, under the limitations of the kernel lock.  If anything in
exit1() sleeps, it has to be ready for some other thread to start into
exit1()!


If exit1() is called with EXIT_NORMAL, the current thread will start by
> setting `ps_single' then iterating over `ps_threads'.  This is done before
> updating `ps_threads' so there's no race in that case.
>
> If exit1() is called with EXIT_THREAD, the current thread might end up
> removing itself from `ps_thread' while another one is iterating over it.
> Since we're currently concerned about the iteration in single_thread_set()
> notice that `P_WEXIT' is checked first.  So if my understanding is correct
> is should be enough to do the following:
>
> SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
> smr_barrier();
>
> That would delays exit1() a bit but doesn't involve callback which is
> IMHO simpler.
>
> Did I miss anything?
>

What does it take to be sure that an "is empty" (or "only contains one")
test is _really_ true with SMR?

Philip


Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Jonathan Matthew
On Tue, Dec 01, 2020 at 02:35:18PM -0300, Martin Pieuchot wrote:
> On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> > [...] 
> > Did you run a make build with that smr_barrier() in it and checked that it
> > does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> > construct which introduces large delays and should be avoided whenever
> > possible.
> 
> I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
> below, without noticeable difference.
> 
> I'm happy to hear from sceptical performance checkers :o)

On a reasonably fast amd64 box, this increases GENERIC.MP make -j6 build time 
from
~3m06s to ~3m44s, which seems a bit much to me.

Replacing smr_barrier() with smr_flush() reduces the overhead to a couple of
seconds, and it seems warranted here.

> 
> diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
> index 96f7dc91b92..1f4f9b914bb 100644
> --- lib/libkvm/kvm_proc2.c
> +++ lib/libkvm/kvm_proc2.c
> @@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> *pr,
>   kp.p_pctcpu = 0;
>   kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
>   SIDL;
> - for (p = TAILQ_FIRST(_threads); p != NULL; 
> - p = TAILQ_NEXT(, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
> + p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, )) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> @@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> *pr,
>   if (!dothreads)
>   continue;
>  
> - for (p = TAILQ_FIRST(_threads); p != NULL; 
> - p = TAILQ_NEXT(, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, )) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> index 5e455208663..575273b306c 100644
> --- sys/kern/exec_elf.c
> +++ sys/kern/exec_elf.c
> @@ -85,6 +85,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, 
> size_t *sizep)
>* threads in the process have been stopped and the list can't
>* change.
>*/
> - TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
>   if (q == p) /* we've taken care of this thread */
>   continue;
>   error = coredump_note_elf(q, iocookie, );
> diff --git sys/kern/init_main.c sys/kern/init_main.c
> index fed6be19435..2b657ffe328 100644
> --- sys/kern/init_main.c
> +++ sys/kern/init_main.c
> @@ -519,7 +519,7 @@ main(void *framep)
>*/
>   LIST_FOREACH(pr, , ps_list) {
>   nanouptime(>ps_start);
> - TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
>   nanouptime(>p_cpu->ci_schedstate.spc_runtime);
>   timespecclear(>p_rtime);
>   }
> diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
> index a20775419e3..3c526ab83b8 100644
> --- sys/kern/kern_exit.c
> +++ sys/kern/kern_exit.c
> @@ -63,6 +63,7 @@
>  #ifdef SYSVSEM
>  #include 
>  #endif
> +#include 
>  #include 
>  
>  #include 
> @@ -161,7 +162,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>   }
>  
>   /* unlink ourselves from the active threads */
> - TAILQ_REMOVE(>ps_threads, p, p_thr_link);
> + SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
> + smr_barrier();
>   if ((p->p_flag & P_THREAD) == 0) {
>   /* main thread gotta wait because it has the pid, et al */
>   while (pr->ps_refcnt > 1)
> @@ -724,7 +726,7 @@ process_zap(struct process *pr)
>   if (pr->ps_ptstat != NULL)
>   free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
>   pool_put(_pool, pr->ps_ru);
> - KASSERT(TAILQ_EMPTY(>ps_threads));
> + KASSERT(SMR_TAILQ_EMPTY_LOCKED(>ps_threads));
>   lim_free(pr->ps_limit);
>   crfree(pr->ps_ucred);
>   pool_put(_pool, pr);
> diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
> index 9fb239bc8b4..e1cb587b2b8 100644
> --- sys/kern/kern_fork.c
> +++ sys/kern/kern_fork.c
> @@ -52,6 +52,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -179,8 +180,8 @@ process_initialize(struct process *pr, struct proc *p)
>  {
>   /* initialize the thread links */

Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Martin Pieuchot
On 01/12/20(Tue) 15:30, Claudio Jeker wrote:
> [...] 
> Did you run a make build with that smr_barrier() in it and checked that it
> does not cause a slow down? I am sceptical, smr_barrier() is a very slow
> construct which introduces large delays and should be avoided whenever
> possible.

I did build GENERIC.MP multiple times on a 4CPU sparc64 with the diff
below, without noticeable difference.

I'm happy to hear from sceptical performance checkers :o)

diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
index 96f7dc91b92..1f4f9b914bb 100644
--- lib/libkvm/kvm_proc2.c
+++ lib/libkvm/kvm_proc2.c
@@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process *pr,
kp.p_pctcpu = 0;
kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
SIDL;
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
+   p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
@@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process *pr,
if (!dothreads)
continue;
 
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 5e455208663..575273b306c 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -85,6 +85,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, size_t 
*sizep)
 * threads in the process have been stopped and the list can't
 * change.
 */
-   TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
if (q == p) /* we've taken care of this thread */
continue;
error = coredump_note_elf(q, iocookie, );
diff --git sys/kern/init_main.c sys/kern/init_main.c
index fed6be19435..2b657ffe328 100644
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -519,7 +519,7 @@ main(void *framep)
 */
LIST_FOREACH(pr, , ps_list) {
nanouptime(>ps_start);
-   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
nanouptime(>p_cpu->ci_schedstate.spc_runtime);
timespecclear(>p_rtime);
}
diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
index a20775419e3..3c526ab83b8 100644
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -63,6 +63,7 @@
 #ifdef SYSVSEM
 #include 
 #endif
+#include 
 #include 
 
 #include 
@@ -161,7 +162,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
}
 
/* unlink ourselves from the active threads */
-   TAILQ_REMOVE(>ps_threads, p, p_thr_link);
+   SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
+   smr_barrier();
if ((p->p_flag & P_THREAD) == 0) {
/* main thread gotta wait because it has the pid, et al */
while (pr->ps_refcnt > 1)
@@ -724,7 +726,7 @@ process_zap(struct process *pr)
if (pr->ps_ptstat != NULL)
free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
pool_put(_pool, pr->ps_ru);
-   KASSERT(TAILQ_EMPTY(>ps_threads));
+   KASSERT(SMR_TAILQ_EMPTY_LOCKED(>ps_threads));
lim_free(pr->ps_limit);
crfree(pr->ps_ucred);
pool_put(_pool, pr);
diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
index 9fb239bc8b4..e1cb587b2b8 100644
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -52,6 +52,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -179,8 +180,8 @@ process_initialize(struct process *pr, struct proc *p)
 {
/* initialize the thread links */
pr->ps_mainproc = p;
-   TAILQ_INIT(>ps_threads);
-   TAILQ_INSERT_TAIL(>ps_threads, p, p_thr_link);
+   SMR_TAILQ_INIT(>ps_threads);
+   SMR_TAILQ_INSERT_TAIL_LOCKED(>ps_threads, p, p_thr_link);
pr->ps_refcnt = 1;
p->p_p = pr;
 
@@ -557,7 +558,7 @@ thread_fork(struct proc *curp, void *stack, void *tcb, 
pid_t *tidptr,
 
LIST_INSERT_HEAD(, p, p_list);
LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, 

Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Claudio Jeker
On Tue, Dec 01, 2020 at 10:46:00AM -0300, Martin Pieuchot wrote:
> On 01/12/20(Tue) 21:47, Jonathan Matthew wrote:
> > On Tue, Dec 01, 2020 at 10:31:43AM +0100, Claudio Jeker wrote:
> > > On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote:
> > > > Every multi-threaded process keeps a list of threads in `ps_threads'.
> > > > This list is iterated in interrupt and process context which makes it
> > > > complicated to protect it with a rwlock.
> > > > 
> > > > One of the places where such iteration is done is inside the tsleep(9)
> > > > routines, directly in single_thread_check() or via CURSIG().  In order
> > > > to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
> > > > use SMR_TAILQ.  This has the advantage of not introducing lock
> > > > dependencies and allow us to address every iteration one-by-one.
> > > > 
> > > > Diff below is a first step into this direction, it replaces the existing
> > > > TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly 
> > > > lifted
> > > > from claudio@'s diff and should not introduce any side effect.
> > > > 
> > > > ok?
> > > > 
> > > > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
> > > > index 390307c4c81..40a10e4c1c5 100644
> > > > --- sys/uvm/uvm_glue.c
> > > > +++ sys/uvm/uvm_glue.c
> > > > @@ -369,7 +369,7 @@ uvm_swapout_threads(void)
> > > >  * the smallest p_slptime
> > > >  */
> > > > slpp = NULL;
> > > > -   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> > > > +   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, 
> > > > p_thr_link) {
> > > > switch (p->p_stat) {
> > > > case SRUN:
> > > > case SONPROC:
> > > > 
> > > 
> > > Why did you not include the smr_call() to safely free struct proc in this
> > > diff?
> 
> So we can discuss it separately,  I'm happy to do it now :)
> 
> > I was wondering about this too.  Freeing the struct proc is already delayed
> > by some amount since it happens in the reaper or in the parent process, does
> > it make sense to combine that with the SMR wait?
> 
> exit1() for a multi-threaded process does the following:
> 
>   atomic_setbits_int(>p_flag, P_WEXIT);
> 
>   single_thread_check(p, 0);  // directly or via single_thread_set()
> 
>   SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
> 
> 
> Note that exit1() can't be called in parallel.
> 
> If exit1() is called with EXIT_NORMAL, the current thread will start by
> setting `ps_single' then iterate over `ps_thread'.  This is done before
> updating `ps_thread' so there's no race in that case.
> 
> If exit1() is called with EXIT_THREAD, the current thread might end up
> removing itself from `ps_thread' while another one is iterating over it.
> Since we're currently concerned about the iteration in single_thread_set()
> notice that `P_WEXIT' is checked first.  So if my understanding is correct
> is should be enough to do the following:
> 
>   SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
>   smr_barrier();
> 
> That would delays exit1() a bit but doesn't involve callback which is
> IMHO simpler.
> 
> Did I miss anything?

Did you run a make build with that smr_barrier() in it and checked that it
does not cause a slow down? I am sceptical, smr_barrier() is a very slow
construct which introduces large delays and should be avoided whenever
possible.

-- 
:wq Claudio



Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Martin Pieuchot
On 01/12/20(Tue) 21:47, Jonathan Matthew wrote:
> On Tue, Dec 01, 2020 at 10:31:43AM +0100, Claudio Jeker wrote:
> > On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote:
> > > Every multi-threaded process keeps a list of threads in `ps_threads'.
> > > This list is iterated in interrupt and process context which makes it
> > > complicated to protect it with a rwlock.
> > > 
> > > One of the places where such iteration is done is inside the tsleep(9)
> > > routines, directly in single_thread_check() or via CURSIG().  In order
> > > to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
> > > use SMR_TAILQ.  This has the advantage of not introducing lock
> > > dependencies and allow us to address every iteration one-by-one.
> > > 
> > > Diff below is a first step into this direction, it replaces the existing
> > > TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly lifted
> > > from claudio@'s diff and should not introduce any side effect.
> > > 
> > > ok?
> > > 
> > > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
> > > index 390307c4c81..40a10e4c1c5 100644
> > > --- sys/uvm/uvm_glue.c
> > > +++ sys/uvm/uvm_glue.c
> > > @@ -369,7 +369,7 @@ uvm_swapout_threads(void)
> > >* the smallest p_slptime
> > >*/
> > >   slpp = NULL;
> > > - TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> > > + SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
> > >   switch (p->p_stat) {
> > >   case SRUN:
> > >   case SONPROC:
> > > 
> > 
> > Why did you not include the smr_call() to safely free struct proc in this
> > diff?

So we can discuss it separately,  I'm happy to do it now :)

> I was wondering about this too.  Freeing the struct proc is already delayed
> by some amount since it happens in the reaper or in the parent process, does
> it make sense to combine that with the SMR wait?

exit1() for a multi-threaded process does the following:

atomic_setbits_int(>p_flag, P_WEXIT);

single_thread_check(p, 0);  // directly or via single_thread_set()

SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);


Note that exit1() can't be called in parallel.

If exit1() is called with EXIT_NORMAL, the current thread will start by
setting `ps_single' then iterate over `ps_thread'.  This is done before
updating `ps_thread' so there's no race in that case.

If exit1() is called with EXIT_THREAD, the current thread might end up
removing itself from `ps_thread' while another one is iterating over it.
Since we're currently concerned about the iteration in single_thread_set()
notice that `P_WEXIT' is checked first.  So if my understanding is correct
it should be enough to do the following:

SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
smr_barrier();

That would delay exit1() a bit but doesn't involve a callback, which is
IMHO simpler.

Did I miss anything?



Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Claudio Jeker
On Tue, Dec 01, 2020 at 09:47:35PM +1000, Jonathan Matthew wrote:
> On Tue, Dec 01, 2020 at 10:31:43AM +0100, Claudio Jeker wrote:
> > On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote:
> > > Every multi-threaded process keeps a list of threads in `ps_threads'.
> > > This list is iterated in interrupt and process context which makes it
> > > complicated to protect it with a rwlock.
> > > 
> > > One of the places where such iteration is done is inside the tsleep(9)
> > > routines, directly in single_thread_check() or via CURSIG().  In order
> > > to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
> > > use SMR_TAILQ.  This has the advantage of not introducing lock
> > > dependencies and allow us to address every iteration one-by-one.
> > > 
> > > Diff below is a first step into this direction, it replaces the existing
> > > TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly lifted
> > > from claudio@'s diff and should not introduce any side effect.
> > > 
> > > ok?
> > > 
> > > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
> > > index 390307c4c81..40a10e4c1c5 100644
> > > --- sys/uvm/uvm_glue.c
> > > +++ sys/uvm/uvm_glue.c
> > > @@ -369,7 +369,7 @@ uvm_swapout_threads(void)
> > >* the smallest p_slptime
> > >*/
> > >   slpp = NULL;
> > > - TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> > > + SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
> > >   switch (p->p_stat) {
> > >   case SRUN:
> > >   case SONPROC:
> > > 
> > 
> > Why did you not include the smr_call() to safely free struct proc in this
> > diff?
> 
> I was wondering about this too.  Freeing the struct proc is already delayed
> by some amount since it happens in the reaper or in the parent process, does
> it make sense to combine that with the SMR wait?

My diff actually did that. My fear was that SMR callbacks can be very
slow and so combining the two was worthwhile. 

-- 
:wq Claudio



Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Jonathan Matthew
On Tue, Dec 01, 2020 at 10:31:43AM +0100, Claudio Jeker wrote:
> On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote:
> > Every multi-threaded process keeps a list of threads in `ps_threads'.
> > This list is iterated in interrupt and process context which makes it
> > complicated to protect it with a rwlock.
> > 
> > One of the places where such iteration is done is inside the tsleep(9)
> > routines, directly in single_thread_check() or via CURSIG().  In order
> > to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
> > use SMR_TAILQ.  This has the advantage of not introducing lock
> > dependencies and allow us to address every iteration one-by-one.
> > 
> > Diff below is a first step into this direction, it replaces the existing
> > TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly lifted
> > from claudio@'s diff and should not introduce any side effect.
> > 
> > ok?
> > 
> > diff --git sys/uvm/uvm_glue.c sys/uvm/uvm_glue.c
> > index 390307c4c81..40a10e4c1c5 100644
> > --- sys/uvm/uvm_glue.c
> > +++ sys/uvm/uvm_glue.c
> > @@ -369,7 +369,7 @@ uvm_swapout_threads(void)
> >  * the smallest p_slptime
> >  */
> > slpp = NULL;
> > -   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> > +   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
> > switch (p->p_stat) {
> > case SRUN:
> > case SONPROC:
> > 
> 
> Why did you not include the smr_call() to safely free struct proc in this
> diff?

I was wondering about this too.  Freeing the struct proc is already delayed
by some amount since it happens in the reaper or in the parent process, does
it make sense to combine that with the SMR wait?



Re: Use SMR_TAILQ for `ps_threads'

2020-12-01 Thread Claudio Jeker
On Mon, Nov 30, 2020 at 07:10:47PM -0300, Martin Pieuchot wrote:
> Every multi-threaded process keeps a list of threads in `ps_threads'.
> This list is iterated in interrupt and process context which makes it
> complicated to protect it with a rwlock.
> 
> One of the places where such iteration is done is inside the tsleep(9)
> routines, directly in single_thread_check() or via CURSIG().  In order
> to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
> use SMR_TAILQ.  This has the advantage of not introducing lock
> dependencies and allow us to address every iteration one-by-one.
> 
> Diff below is a first step into this direction, it replaces the existing
> TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly lifted
> from claudio@'s diff and should not introduce any side effect.
> 
> ok?
> 
> diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
> index 96f7dc91b92..1f4f9b914bb 100644
> --- lib/libkvm/kvm_proc2.c
> +++ lib/libkvm/kvm_proc2.c
> @@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> *pr,
>   kp.p_pctcpu = 0;
>   kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
>   SIDL;
> - for (p = TAILQ_FIRST(_threads); p != NULL; 
> - p = TAILQ_NEXT(, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
> + p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, )) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> @@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process 
> *pr,
>   if (!dothreads)
>   continue;
>  
> - for (p = TAILQ_FIRST(_threads); p != NULL; 
> - p = TAILQ_NEXT(, p_thr_link)) {
> + for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
> + p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
>   if (KREAD(kd, (u_long)p, )) {
>   _kvm_err(kd, kd->program,
>   "can't read proc at %lx",
> diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
> index 5e455208663..575273b306c 100644
> --- sys/kern/exec_elf.c
> +++ sys/kern/exec_elf.c
> @@ -85,6 +85,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, 
> size_t *sizep)
>* threads in the process have been stopped and the list can't
>* change.
>*/
> - TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
>   if (q == p) /* we've taken care of this thread */
>   continue;
>   error = coredump_note_elf(q, iocookie, );
> diff --git sys/kern/init_main.c sys/kern/init_main.c
> index fed6be19435..2b657ffe328 100644
> --- sys/kern/init_main.c
> +++ sys/kern/init_main.c
> @@ -519,7 +519,7 @@ main(void *framep)
>*/
>   LIST_FOREACH(pr, , ps_list) {
>   nanouptime(>ps_start);
> - TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
> + SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
>   nanouptime(>p_cpu->ci_schedstate.spc_runtime);
>   timespecclear(>p_rtime);
>   }
> diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
> index a20775419e3..ffc0158954c 100644
> --- sys/kern/kern_exit.c
> +++ sys/kern/kern_exit.c
> @@ -63,6 +63,7 @@
>  #ifdef SYSVSEM
>  #include 
>  #endif
> +#include 
>  #include 
>  
>  #include 
> @@ -161,7 +162,7 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>   }
>  
>   /* unlink ourselves from the active threads */
> - TAILQ_REMOVE(>ps_threads, p, p_thr_link);
> + SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
>   if ((p->p_flag & P_THREAD) == 0) {
>   /* main thread gotta wait because it has the pid, et al */
>   while (pr->ps_refcnt > 1)
> @@ -724,7 +725,7 @@ process_zap(struct process *pr)
>   if (pr->ps_ptstat != NULL)
>   free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
>   pool_put(_pool, pr->ps_ru);
> - KASSERT(TAILQ_EMPTY(>ps_threads));
> + KASSERT(SMR_TAILQ_EMPTY_LOCKED(>ps_threads));
>   lim_free(pr->ps_limit);
>   crfree(pr->ps_ucred);
>   pool_put(_pool, pr);
> diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
> index 9fb239bc8b4..e1cb587b2b8 100644
> --- sys/kern/kern_fork.c
> +++ sys/kern/kern_fork.c
> @@ -52,6 +52,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -179,8 +180,8 @@ process_initialize(struct process *pr, struct proc *p)
>  {
>   /* initialize the 

Use SMR_TAILQ for `ps_threads'

2020-11-30 Thread Martin Pieuchot
Every multi-threaded process keeps a list of threads in `ps_threads'.
This list is iterated in interrupt and process context which makes it
complicated to protect it with a rwlock.

One of the places where such iteration is done is inside the tsleep(9)
routines, directly in single_thread_check() or via CURSIG().  In order
to take this code path out of the KERNEL_LOCK(), claudio@ proposed to
use SMR_TAILQ.  This has the advantage of not introducing lock
dependencies and allow us to address every iteration one-by-one.

Diff below is a first step into this direction, it replaces the existing
TAILQ_* macros by the locked version of SMR_TAILQ*.  This is mostly lifted
from claudio@'s diff and should not introduce any side effect.

ok?

diff --git lib/libkvm/kvm_proc2.c lib/libkvm/kvm_proc2.c
index 96f7dc91b92..1f4f9b914bb 100644
--- lib/libkvm/kvm_proc2.c
+++ lib/libkvm/kvm_proc2.c
@@ -341,8 +341,9 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process *pr,
kp.p_pctcpu = 0;
kp.p_stat = (process.ps_flags & PS_ZOMBIE) ? SDEAD :
SIDL;
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads);
+   p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
@@ -376,8 +377,8 @@ kvm_proclist(kvm_t *kd, int op, int arg, struct process *pr,
if (!dothreads)
continue;
 
-   for (p = TAILQ_FIRST(_threads); p != NULL; 
-   p = TAILQ_NEXT(, p_thr_link)) {
+   for (p = SMR_TAILQ_FIRST_LOCKED(_threads); p != NULL;
+   p = SMR_TAILQ_NEXT_LOCKED(, p_thr_link)) {
if (KREAD(kd, (u_long)p, )) {
_kvm_err(kd, kd->program,
"can't read proc at %lx",
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 5e455208663..575273b306c 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -85,6 +85,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -1360,7 +1361,7 @@ coredump_notes_elf(struct proc *p, void *iocookie, size_t 
*sizep)
 * threads in the process have been stopped and the list can't
 * change.
 */
-   TAILQ_FOREACH(q, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(q, >ps_threads, p_thr_link) {
if (q == p) /* we've taken care of this thread */
continue;
error = coredump_note_elf(q, iocookie, );
diff --git sys/kern/init_main.c sys/kern/init_main.c
index fed6be19435..2b657ffe328 100644
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -519,7 +519,7 @@ main(void *framep)
 */
LIST_FOREACH(pr, , ps_list) {
nanouptime(>ps_start);
-   TAILQ_FOREACH(p, >ps_threads, p_thr_link) {
+   SMR_TAILQ_FOREACH_LOCKED(p, >ps_threads, p_thr_link) {
nanouptime(>p_cpu->ci_schedstate.spc_runtime);
timespecclear(>p_rtime);
}
diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
index a20775419e3..ffc0158954c 100644
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -63,6 +63,7 @@
 #ifdef SYSVSEM
 #include 
 #endif
+#include 
 #include 
 
 #include 
@@ -161,7 +162,7 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
}
 
/* unlink ourselves from the active threads */
-   TAILQ_REMOVE(>ps_threads, p, p_thr_link);
+   SMR_TAILQ_REMOVE_LOCKED(>ps_threads, p, p_thr_link);
if ((p->p_flag & P_THREAD) == 0) {
/* main thread gotta wait because it has the pid, et al */
while (pr->ps_refcnt > 1)
@@ -724,7 +725,7 @@ process_zap(struct process *pr)
if (pr->ps_ptstat != NULL)
free(pr->ps_ptstat, M_SUBPROC, sizeof(*pr->ps_ptstat));
pool_put(_pool, pr->ps_ru);
-   KASSERT(TAILQ_EMPTY(>ps_threads));
+   KASSERT(SMR_TAILQ_EMPTY_LOCKED(>ps_threads));
lim_free(pr->ps_limit);
crfree(pr->ps_ucred);
pool_put(_pool, pr);
diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
index 9fb239bc8b4..e1cb587b2b8 100644
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -52,6 +52,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -179,8 +180,8 @@ process_initialize(struct process *pr, struct proc *p)
 {
/* initialize the thread links */
pr->ps_mainproc = p;
-   TAILQ_INIT(>ps_threads);
-   TAILQ_INSERT_TAIL(>ps_threads, p, p_thr_link);
+   SMR_TAILQ_INIT(>ps_threads);
+