Re: threaded prof signals

2013-10-06 Thread Philip Guenther
On Sun, 6 Oct 2013, Joel Sing wrote:
...
 I like the concept of this diff, however it is not sufficient to solve 
 the multithreaded profiling problem. The reason for this is that you're 
 still sending SPROCESS signals, which are delivered to the main thread 
 unless another thread has diverted the signal. This means that all of 
 the profiling signals end up on the main thread, not the thread that was 
 consuming CPU time.
 
 This can be fixed by either using ptsignal(p, SIG{PROF,VTALRM}, 
 STHREAD) instead of psignal(), or by including your other diff that 
 changes the signal delivery within ptsignal - regardless, I think the 
 previous diff is worth pursuing since it potentially reduces the number 
 of signal related context switches.

Okay, here's a complete diff that, at least on my box with a regress test 
that jsing@ should be committing soon, solves the problem, reduces the 
total complexity of the itimer signal delivery, and reduces the context 
switches caused by signal by having a thread sending a signal to its own 
process take delivery of the signal if it can.

(While checking up on something else, I found that FreeBSD already uses 
this take the cannoli^Wsignal logic.)

ok?

Philip

Index: bin/ps/ps.1
===
RCS file: /cvs/src/bin/ps/ps.1,v
retrieving revision 1.84
diff -u -p -r1.84 ps.1
--- bin/ps/ps.1 22 Sep 2013 17:28:34 -  1.84
+++ bin/ps/ps.1 7 Oct 2013 02:08:32 -
@@ -222,6 +222,8 @@ The thread flags (in hexadecimal), as de
 .Aq Pa sys/proc.h :
 .Bd -literal
 P_INKTR   0x1 writing ktrace(2) record
+P_PROFPEND0x2 this thread needs SIGPROF
+P_ALRMPEND0x4 this thread needs SIGVTALRM
 P_SIGSUSPEND  0x8 need to restore before-suspend mask
 P_SELECT 0x40 selecting; wakeup/waiting danger
 P_SINTR  0x80 sleep is interruptible
Index: sys/sys/proc.h
===
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.170
diff -u -p -r1.170 proc.h
--- sys/sys/proc.h  22 Sep 2013 17:28:33 -  1.170
+++ sys/sys/proc.h  7 Oct 2013 02:08:35 -
@@ -209,8 +209,6 @@ struct process {
 
struct  timespec ps_start;  /* starting time. */
struct  timeout ps_realit_to;   /* real-time itimer trampoline. */
-   struct  timeout ps_virt_to; /* virtual itimer trampoline. */
-   struct  timeout ps_prof_to; /* prof itimer trampoline. */
 
int ps_refcnt;  /* Number of references. */
 };
@@ -362,6 +360,8 @@ struct proc {
  * These flags are per-thread and kept in p_flag
  */
 #defineP_INKTR 0x01/* In a ktrace op, don't 
recurse */
+#defineP_PROFPEND  0x02/* SIGPROF needs to be posted */
+#defineP_ALRMPEND  0x04/* SIGVTALRM needs to be posted 
*/
 #defineP_SIGSUSPEND0x08/* Need to restore 
before-suspend mask*/
 #defineP_SELECT0x40/* Selecting; wakeup/waiting 
danger. */
 #defineP_SINTR 0x80/* Sleep is interruptible. */
@@ -380,7 +380,8 @@ struct proc {
 #define P_CPUPEG   0x4000  /* Do not move to another cpu. */
 
 #defineP_BITS \
-(\20\01INKTR\04SIGSUSPEND\07SELECT\010SINTR\012SYSTEM \
+(\20\01INKTR\02PROFPEND\03ALRMPEND\04SIGSUSPEND\07SELECT \
+ \010SINTR\012SYSTEM \
  \013TIMEOUT\016WEXIT\020OWEUPC\024SUSPSINGLE \
  \025NOZOMBIE\027SYSTRACE\030CONTINUED\033THREAD \
  \034SUSPSIG\035SOFTDEP\036STOPPED\037CPUPEG)
Index: sys/sys/resourcevar.h
===
RCS file: /cvs/src/sys/sys/resourcevar.h,v
retrieving revision 1.16
diff -u -p -r1.16 resourcevar.h
--- sys/sys/resourcevar.h   3 Jun 2013 16:55:22 -   1.16
+++ sys/sys/resourcevar.h   7 Oct 2013 02:08:35 -
@@ -68,8 +68,5 @@ struct plimit *limcopy(struct plimit *);
 void   limfree(struct plimit *);
 
 voidruadd(struct rusage *, struct rusage *);
-
-void   virttimer_trampoline(void *);
-void   proftimer_trampoline(void *);
 #endif
 #endif /* !_SYS_RESOURCEVAR_H_ */
Index: sys/kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.82
diff -u -p -r1.82 kern_clock.c
--- sys/kern/kern_clock.c   13 Aug 2013 05:52:23 -  1.82
+++ sys/kern/kern_clock.c   7 Oct 2013 02:08:37 -
@@ -144,40 +144,11 @@ initclocks(void)
 /*
  * hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
  * We don't want to send signals with psignal from hardclock because it makes
- * MULTIPROCESSOR locking very complicated. Instead we use a small trick
- * to send the signals safely and without blocking too many interrupts
- * while doing that (signal handling can be heavy).
- *
- * hardclock detects that the itimer has expired, and schedules a 

Re: threaded prof signals

2013-10-05 Thread Todd C. Miller
Setting a flag seems more straightforward than (ab)using timeouts
for this.  psignal() is a rather complicated function but I don't
see any problems offhand with running it from userret().

 - todd



Re: threaded prof signals

2013-10-05 Thread Mark Kettenis
 Date: Thu, 3 Oct 2013 18:12:59 -0700
 From: Philip Guenther guent...@gmail.com
 
 On Fri, 16 Aug 2013, Ted Unangst wrote:
  As per http://research.swtch.com/macpprof
  
  We deliver all prof signals to the main thread, which is unlikely to
  result in accurate profiling info. I think the diff below fixes things.
 
 How about we take an idea from FreeBSD and have hardclock() just set a 
 flag on the thread and then have the thread send SIGPROF (and SIGVTALRM) 
 to itself from userret().  The signal gets sent by the thread itself right 
 before it checks for pending signals when returning to userspace, so 
 that's absolutely as soon as possible, and it's done from process context 
 instead of from softclock by a timeout, so no which CPU are we on? 
 issues that might delay the delivery.

That seems like a sensible thing to do.  However...

 Index: sys/kern/kern_clock.c
 ===
 RCS file: /cvs/src/sys/kern/kern_clock.c,v
 retrieving revision 1.82
 diff -u -p -r1.82 kern_clock.c
 --- sys/kern/kern_clock.c 13 Aug 2013 05:52:23 -  1.82
 +++ sys/kern/kern_clock.c 4 Oct 2013 01:00:19 -
 @@ -144,40 +144,11 @@ initclocks(void)
  /*
   * hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
   * We don't want to send signals with psignal from hardclock because it makes
 - * MULTIPROCESSOR locking very complicated. Instead we use a small trick
 - * to send the signals safely and without blocking too many interrupts
 - * while doing that (signal handling can be heavy).
 - *
 - * hardclock detects that the itimer has expired, and schedules a timeout
 - * to deliver the signal. This works because of the following reasons:
 - *  - The timeout can be scheduled with a 1 tick time because we're
 - *doing it before the timeout processing in hardclock. So it will
 - *be scheduled to run as soon as possible.
 - *  - The timeout will be run in softclock which will run before we
 - *return to userland and process pending signals.
 - *  - If the system is so busy that several VIRTUAL/PROF ticks are
 - *sent before softclock processing, we'll send only one signal.
 - *But if we'd send the signal from hardclock only one signal would
 - *be delivered to the user process. So userland will only see one
 - *signal anyway.
 + * MULTIPROCESSOR locking very complicated. Instead, to use an idea from
 + * FreeBSD, we set a flag on the thread and when it goes to return to
 + * userspace it signals itself.
   */
  
 -void
 -virttimer_trampoline(void *v)
 -{
 - struct process *pr = v;
 -
 - psignal(pr->ps_mainproc, SIGVTALRM);
 -}
 -
 -void
 -proftimer_trampoline(void *v)
 -{
 - struct process *pr = v;
 -
 - psignal(pr->ps_mainproc, SIGPROF);
 -}
 -
  /*
   * The real-time timer, interrupting hz times per second.
   */
 @@ -196,11 +167,15 @@ hardclock(struct clockframe *frame)
*/
   if (CLKF_USERMODE(frame) &&
   timerisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
 - itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick) == 0)
 - timeout_add(&pr->ps_virt_to, 1);
 + itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick) == 0) {
 + atomic_setbits_int(&p->p_flag, P_ALRMPEND);
 + need_proftick(p);
 + }
   if (timerisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
 - itimerdecr(&pr->ps_timer[ITIMER_PROF], tick) == 0)
 - timeout_add(&pr->ps_prof_to, 1);
 + itimerdecr(&pr->ps_timer[ITIMER_PROF], tick) == 0) {
 + atomic_setbits_int(&p->p_flag, P_PROFPEND);
 + need_proftick(p);
 + }

I don't quite understand why you added the need_proftick() calls here.



Re: threaded prof signals

2013-10-05 Thread Joel Sing
On Fri, 4 Oct 2013, Philip Guenther wrote:
 On Fri, 16 Aug 2013, Ted Unangst wrote:
  As per http://research.swtch.com/macpprof
 
  We deliver all prof signals to the main thread, which is unlikely to
  result in accurate profiling info. I think the diff below fixes things.

 How about we take an idea from FreeBSD and have hardclock() just set a
 flag on the thread and then have the thread send SIGPROF (and SIGVTALRM)
 to itself from userret().  The signal gets sent by the thread itself right
 before it checks for pending signals when returning to userspace, so
 that's absolutely as soon as possible, and it's done from process context
 instead of from softclock by a timeout, so no which CPU are we on?
 issues that might delay the delivery.

 Ted, Joel, does this solve your profiling problems?

Nope.

I like the concept of this diff, however it is not sufficient to solve the 
multithreaded profiling problem. The reason for this is that you're still 
sending SPROCESS signals, which are delivered to the main thread unless 
another thread has diverted the signal. This means that all of the profiling 
signals end up on the main thread, not the thread that was consuming CPU 
time.

This can be fixed by either using ptsignal(p, SIG{PROF,VTALRM}, STHREAD) 
instead of psignal(), or by including your other diff that changes the signal 
delivery within ptsignal - regardless, I think the previous diff is worth 
pursuing since it potentially reduces the number of signal related context 
switches.

P.S. The diff below misses a chunk from kern_time.c.

 (No, this had nothing at all to do with my splitting the process and
 thread flags, what could possibly make you think that...)


 Philip

 Index: sys/sys/proc.h
 ===
 RCS file: /cvs/src/sys/sys/proc.h,v
 retrieving revision 1.170
 diff -u -p -r1.170 proc.h
 --- sys/sys/proc.h22 Sep 2013 17:28:33 -  1.170
 +++ sys/sys/proc.h4 Oct 2013 01:00:18 -
 @@ -209,8 +209,6 @@ struct process {

   struct  timespec ps_start;  /* starting time. */
   struct  timeout ps_realit_to;   /* real-time itimer trampoline. */
 - struct  timeout ps_virt_to; /* virtual itimer trampoline. */
 - struct  timeout ps_prof_to; /* prof itimer trampoline. */

   int ps_refcnt;  /* Number of references. */
  };
 @@ -362,6 +361,8 @@ struct proc {
   * These flags are per-thread and kept in p_flag
   */
  #define  P_INKTR 0x01/* In a ktrace op, don't 
 recurse */
 +#define  P_PROFPEND  0x02/* SIGPROF needs to be posted */
 +#define  P_ALRMPEND  0x04/* SIGVTALRM needs to be posted 
 */
  #define  P_SIGSUSPEND0x08/* Need to restore 
 before-suspend mask*/
  #define  P_SELECT0x40/* Selecting; wakeup/waiting 
 danger. */
  #define  P_SINTR 0x80/* Sleep is interruptible. */
 @@ -380,7 +381,8 @@ struct proc {
  #define P_CPUPEG 0x4000  /* Do not move to another cpu. */

  #define  P_BITS \
 -(\20\01INKTR\04SIGSUSPEND\07SELECT\010SINTR\012SYSTEM \
 +(\20\01INKTR\02PROFPEND\03ALRMPEND\04SIGSUSPEND\07SELECT \
 + \010SINTR\012SYSTEM \
   \013TIMEOUT\016WEXIT\020OWEUPC\024SUSPSINGLE \
   \025NOZOMBIE\027SYSTRACE\030CONTINUED\033THREAD \
   \034SUSPSIG\035SOFTDEP\036STOPPED\037CPUPEG)
 Index: sys/sys/resourcevar.h
 ===
 RCS file: /cvs/src/sys/sys/resourcevar.h,v
 retrieving revision 1.16
 diff -u -p -r1.16 resourcevar.h
 --- sys/sys/resourcevar.h 3 Jun 2013 16:55:22 -   1.16
 +++ sys/sys/resourcevar.h 4 Oct 2013 01:00:18 -
 @@ -68,8 +68,5 @@ struct plimit *limcopy(struct plimit *);
  void limfree(struct plimit *);

  void  ruadd(struct rusage *, struct rusage *);
 -
 -void virttimer_trampoline(void *);
 -void proftimer_trampoline(void *);
  #endif
  #endif   /* !_SYS_RESOURCEVAR_H_ */
 Index: sys/kern/kern_clock.c
 ===
 RCS file: /cvs/src/sys/kern/kern_clock.c,v
 retrieving revision 1.82
 diff -u -p -r1.82 kern_clock.c
 --- sys/kern/kern_clock.c 13 Aug 2013 05:52:23 -  1.82
 +++ sys/kern/kern_clock.c 4 Oct 2013 01:00:19 -
 @@ -144,40 +144,11 @@ initclocks(void)
  /*
   * hardclock does the accounting needed for ITIMER_PROF and
 ITIMER_VIRTUAL. * We don't want to send signals with psignal from hardclock
 because it makes - * MULTIPROCESSOR locking very complicated. Instead we
 use a small trick - * to send the signals safely and without blocking too
 many interrupts - * while doing that (signal handling can be heavy).
 - *
 - * hardclock detects that the itimer has expired, and schedules a timeout
 - * to deliver the signal. This works because of the following reasons:
 - *  - The timeout can be scheduled with a 1 tick time because we're
 - *doing 

Re: threaded prof signals

2013-10-05 Thread Philip Guenther
On Sat, Oct 5, 2013 at 5:52 AM, Mark Kettenis mark.kette...@xs4all.nl wrote:
 Date: Thu, 3 Oct 2013 18:12:59 -0700
 From: Philip Guenther guent...@gmail.com
...
 @@ -196,11 +167,15 @@ hardclock(struct clockframe *frame)
*/
   if (CLKF_USERMODE(frame) &&
   timerisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
 - itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick) == 0)
 - timeout_add(&pr->ps_virt_to, 1);
 + itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick) == 0) {
 + atomic_setbits_int(&p->p_flag, P_ALRMPEND);
 + need_proftick(p);
 + }
   if (timerisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
 - itimerdecr(&pr->ps_timer[ITIMER_PROF], tick) == 0)
 - timeout_add(&pr->ps_prof_to, 1);
 + itimerdecr(&pr->ps_timer[ITIMER_PROF], tick) == 0) {
 + atomic_setbits_int(&p->p_flag, P_PROFPEND);
 + need_proftick(p);
 + }

 I don't quite understand why you added the need_proftick() calls here.

userret() is not normally called when returning to userspace after a
random interrupt like the clock interrupt.  To force that and trigger
a call to trap() we need to set the MD AST flags which is checked from
the return-to-userspace code.  That's exactly what need_proftick()
does.

(We could eliminate need_proftick() and just call aston() directly if
the hp300 and mvme68k versions of aston() took an argument like the
others...)


Philip Guenther



Re: threaded prof signals

2013-10-04 Thread Ted Unangst
On Thu, Oct 03, 2013 at 18:12, Philip Guenther wrote:
 How about we take an idea from FreeBSD and have hardclock() just set a
 flag on the thread and then have the thread send SIGPROF (and SIGVTALRM)
 to itself from userret().  The signal gets sent by the thread itself right
 before it checks for pending signals when returning to userspace, so
 that's absolutely as soon as possible, and it's done from process context
 instead of from softclock by a timeout, so no which CPU are we on?
 issues that might delay the delivery.

I approve of all diffs that eliminate useless trampolines. I have no
idea if the diff works, but I will provisionally ok the idea.



Re: threaded prof signals

2013-10-03 Thread Philip Guenther
On Fri, 16 Aug 2013, Ted Unangst wrote:
 As per http://research.swtch.com/macpprof
 
 We deliver all prof signals to the main thread, which is unlikely to
 result in accurate profiling info. I think the diff below fixes things.

How about we take an idea from FreeBSD and have hardclock() just set a 
flag on the thread and then have the thread send SIGPROF (and SIGVTALRM) 
to itself from userret().  The signal gets sent by the thread itself right 
before it checks for pending signals when returning to userspace, so 
that's absolutely as soon as possible, and it's done from process context 
instead of from softclock by a timeout, so no which CPU are we on? 
issues that might delay the delivery.

Ted, Joel, does this solve your profiling problems?

(No, this had nothing at all to do with my splitting the process and 
thread flags, what could possibly make you think that...)


Philip

Index: sys/sys/proc.h
===
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.170
diff -u -p -r1.170 proc.h
--- sys/sys/proc.h  22 Sep 2013 17:28:33 -  1.170
+++ sys/sys/proc.h  4 Oct 2013 01:00:18 -
@@ -209,8 +209,6 @@ struct process {
 
struct  timespec ps_start;  /* starting time. */
struct  timeout ps_realit_to;   /* real-time itimer trampoline. */
-   struct  timeout ps_virt_to; /* virtual itimer trampoline. */
-   struct  timeout ps_prof_to; /* prof itimer trampoline. */
 
int ps_refcnt;  /* Number of references. */
 };
@@ -362,6 +361,8 @@ struct proc {
  * These flags are per-thread and kept in p_flag
  */
 #defineP_INKTR 0x01/* In a ktrace op, don't 
recurse */
+#defineP_PROFPEND  0x02/* SIGPROF needs to be posted */
+#defineP_ALRMPEND  0x04/* SIGVTALRM needs to be posted 
*/
 #defineP_SIGSUSPEND0x08/* Need to restore 
before-suspend mask*/
 #defineP_SELECT0x40/* Selecting; wakeup/waiting 
danger. */
 #defineP_SINTR 0x80/* Sleep is interruptible. */
@@ -380,7 +381,8 @@ struct proc {
 #define P_CPUPEG   0x4000  /* Do not move to another cpu. */
 
 #defineP_BITS \
-(\20\01INKTR\04SIGSUSPEND\07SELECT\010SINTR\012SYSTEM \
+(\20\01INKTR\02PROFPEND\03ALRMPEND\04SIGSUSPEND\07SELECT \
+ \010SINTR\012SYSTEM \
  \013TIMEOUT\016WEXIT\020OWEUPC\024SUSPSINGLE \
  \025NOZOMBIE\027SYSTRACE\030CONTINUED\033THREAD \
  \034SUSPSIG\035SOFTDEP\036STOPPED\037CPUPEG)
Index: sys/sys/resourcevar.h
===
RCS file: /cvs/src/sys/sys/resourcevar.h,v
retrieving revision 1.16
diff -u -p -r1.16 resourcevar.h
--- sys/sys/resourcevar.h   3 Jun 2013 16:55:22 -   1.16
+++ sys/sys/resourcevar.h   4 Oct 2013 01:00:18 -
@@ -68,8 +68,5 @@ struct plimit *limcopy(struct plimit *);
 void   limfree(struct plimit *);
 
 voidruadd(struct rusage *, struct rusage *);
-
-void   virttimer_trampoline(void *);
-void   proftimer_trampoline(void *);
 #endif
 #endif /* !_SYS_RESOURCEVAR_H_ */
Index: sys/kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.82
diff -u -p -r1.82 kern_clock.c
--- sys/kern/kern_clock.c   13 Aug 2013 05:52:23 -  1.82
+++ sys/kern/kern_clock.c   4 Oct 2013 01:00:19 -
@@ -144,40 +144,11 @@ initclocks(void)
 /*
  * hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
  * We don't want to send signals with psignal from hardclock because it makes
- * MULTIPROCESSOR locking very complicated. Instead we use a small trick
- * to send the signals safely and without blocking too many interrupts
- * while doing that (signal handling can be heavy).
- *
- * hardclock detects that the itimer has expired, and schedules a timeout
- * to deliver the signal. This works because of the following reasons:
- *  - The timeout can be scheduled with a 1 tick time because we're
- *doing it before the timeout processing in hardclock. So it will
- *be scheduled to run as soon as possible.
- *  - The timeout will be run in softclock which will run before we
- *return to userland and process pending signals.
- *  - If the system is so busy that several VIRTUAL/PROF ticks are
- *sent before softclock processing, we'll send only one signal.
- *But if we'd send the signal from hardclock only one signal would
- *be delivered to the user process. So userland will only see one
- *signal anyway.
+ * MULTIPROCESSOR locking very complicated. Instead, to use an idea from
+ * FreeBSD, we set a flag on the thread and when it goes to return to
+ * userspace it signals itself.
  */
 
-void
-virttimer_trampoline(void *v)
-{
-   struct process *pr = v;
-
-   psignal(pr->ps_mainproc, SIGVTALRM);

Re: threaded prof signals

2013-08-28 Thread Ted Unangst
On Fri, Aug 16, 2013 at 02:12, Ted Unangst wrote:
 As per http://research.swtch.com/macpprof
 
 We deliver all prof signals to the main thread, which is unlikely to
 result in accurate profiling info. I think the diff below fixes things.

floating this again. note that it's not the clock being made per
thread, just the timeout, which is only used to move signal delivery
from hardclock to softclock.

 
 Index: kern/kern_clock.c
 ===
 RCS file: /cvs/src/sys/kern/kern_clock.c,v
 retrieving revision 1.81
 diff -u -p -r1.81 kern_clock.c
 --- kern/kern_clock.c 24 Apr 2013 17:29:02 -  1.81
 +++ kern/kern_clock.c 16 Aug 2013 05:57:27 -
 @@ -173,9 +173,9 @@ virttimer_trampoline(void *v)
 void
 proftimer_trampoline(void *v)
 {
 - struct process *pr = v;
 + struct proc *p = v;
 
 - psignal(pr->ps_mainproc, SIGPROF);
 + psignal(p, SIGPROF);
 }
 
 /*
 @@ -200,7 +200,7 @@ hardclock(struct clockframe *frame)
 timeout_add(&pr->ps_virt_to, 1);
 if (timerisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
 itimerdecr(&pr->ps_timer[ITIMER_PROF], tick) == 0)
 - timeout_add(&pr->ps_prof_to, 1);
 + timeout_add(&p->p_prof_to, 1);
 }
 
 /*
 Index: kern/kern_exit.c
 ===
 RCS file: /cvs/src/sys/kern/kern_exit.c,v
 retrieving revision 1.125
 diff -u -p -r1.125 kern_exit.c
 --- kern/kern_exit.c  5 Jun 2013 00:53:26 -   1.125
 +++ kern/kern_exit.c  16 Aug 2013 05:57:27 -
 @@ -183,10 +183,10 @@ exit1(struct proc *p, int rv, int flags)
 */
 fdfree(p);
 
 + timeout_del(&p->p_prof_to);
 if ((p->p_flag & P_THREAD) == 0) {
 timeout_del(&pr->ps_realit_to);
 timeout_del(&pr->ps_virt_to);
 - timeout_del(&pr->ps_prof_to);
 #ifdef SYSVSEM
 semexit(pr);
 #endif
 Index: kern/kern_fork.c
 ===
 RCS file: /cvs/src/sys/kern/kern_fork.c,v
 retrieving revision 1.152
 diff -u -p -r1.152 kern_fork.c
 --- kern/kern_fork.c  11 Jun 2013 13:00:31 -  1.152
 +++ kern/kern_fork.c  16 Aug 2013 05:57:27 -
 @@ -209,7 +209,6 @@ process_new(struct proc *p, struct proce
 
 timeout_set(&pr->ps_realit_to, realitexpire, pr);
 timeout_set(&pr->ps_virt_to, virttimer_trampoline, pr);
 - timeout_set(&pr->ps_prof_to, proftimer_trampoline, pr);
 
 pr-ps_flags = parent-ps_flags  (PS_SUGID | PS_SUGIDEXEC);
 if (parent-ps_session-s_ttyvp != NULL 
 @@ -342,6 +341,7 @@ fork1(struct proc *curp, int exitsig, in
 * Initialize the timeouts.
 */
 timeout_set(&p->p_sleep_to, endtsleep, p);
 + timeout_set(&p->p_prof_to, proftimer_trampoline, p);
 
 /*
 * Duplicate sub-structures as needed.
 Index: kern/kern_time.c
 ===
 RCS file: /cvs/src/sys/kern/kern_time.c,v
 retrieving revision 1.80
 diff -u -p -r1.80 kern_time.c
 --- kern/kern_time.c  17 Jun 2013 19:11:54 -  1.80
 +++ kern/kern_time.c  16 Aug 2013 05:57:27 -
 @@ -568,7 +568,7 @@ sys_setitimer(struct proc *p, void *v, r
 if (which == ITIMER_VIRTUAL)
 timeout_del(&pr->ps_virt_to);
 if (which == ITIMER_PROF)
 - timeout_del(&pr->ps_prof_to);
 + timeout_del(&p->p_prof_to);
 splx(s);
 }
 
 Index: sys/proc.h
 ===
 RCS file: /cvs/src/sys/sys/proc.h,v
 retrieving revision 1.168
 diff -u -p -r1.168 proc.h
 --- sys/proc.h6 Jun 2013 13:09:37 -   1.168
 +++ sys/proc.h16 Aug 2013 05:57:28 -
 @@ -210,7 +210,6 @@ struct process {
 structtimespec ps_start;  /* starting time. */
 structtimeout ps_realit_to;   /* real-time itimer trampoline. */
 structtimeout ps_virt_to; /* virtual itimer trampoline. */
 - struct  timeout ps_prof_to; /* prof itimer trampoline. */
 
 int   ps_refcnt;  /* Number of references. */
 };
 @@ -337,6 +336,7 @@ struct proc {
 /* End area that is copied on creation. */
 #define   p_endcopy   p_addr
 
 + struct  timeout p_prof_to;  /* prof itimer trampoline. */
 structuser *p_addr;   /* Kernel virtual addr of u-area */
 structmdproc p_md;/* Any machine-dependent fields. */



threaded prof signals

2013-08-16 Thread Ted Unangst
As per http://research.swtch.com/macpprof

We deliver all prof signals to the main thread, which is unlikely to
result in accurate profiling info. I think the diff below fixes things.

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.81
diff -u -p -r1.81 kern_clock.c
--- kern/kern_clock.c   24 Apr 2013 17:29:02 -  1.81
+++ kern/kern_clock.c   16 Aug 2013 05:57:27 -
@@ -173,9 +173,9 @@ virttimer_trampoline(void *v)
 void
 proftimer_trampoline(void *v)
 {
-   struct process *pr = v;
+   struct proc *p = v;
 
-   psignal(pr->ps_mainproc, SIGPROF);
+   psignal(p, SIGPROF);
 }
 
 /*
@@ -200,7 +200,7 @@ hardclock(struct clockframe *frame)
	timeout_add(&pr->ps_virt_to, 1);
	if (timerisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
	itimerdecr(&pr->ps_timer[ITIMER_PROF], tick) == 0)
-   timeout_add(&pr->ps_prof_to, 1);
+   timeout_add(&p->p_prof_to, 1);
}
 
/*
Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.125
diff -u -p -r1.125 kern_exit.c
--- kern/kern_exit.c5 Jun 2013 00:53:26 -   1.125
+++ kern/kern_exit.c16 Aug 2013 05:57:27 -
@@ -183,10 +183,10 @@ exit1(struct proc *p, int rv, int flags)
 */
fdfree(p);
 
+   timeout_del(&p->p_prof_to);
	if ((p->p_flag & P_THREAD) == 0) {
	timeout_del(&pr->ps_realit_to);
	timeout_del(&pr->ps_virt_to);
-   timeout_del(&pr->ps_prof_to);
 #ifdef SYSVSEM
semexit(pr);
 #endif
Index: kern/kern_fork.c
===
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.152
diff -u -p -r1.152 kern_fork.c
--- kern/kern_fork.c11 Jun 2013 13:00:31 -  1.152
+++ kern/kern_fork.c16 Aug 2013 05:57:27 -
@@ -209,7 +209,6 @@ process_new(struct proc *p, struct proce
 
	timeout_set(&pr->ps_realit_to, realitexpire, pr);
	timeout_set(&pr->ps_virt_to, virttimer_trampoline, pr);
-   timeout_set(&pr->ps_prof_to, proftimer_trampoline, pr);
 
pr-ps_flags = parent-ps_flags  (PS_SUGID | PS_SUGIDEXEC);
if (parent-ps_session-s_ttyvp != NULL 
@@ -342,6 +341,7 @@ fork1(struct proc *curp, int exitsig, in
 * Initialize the timeouts.
 */
	timeout_set(&p->p_sleep_to, endtsleep, p);
+   timeout_set(&p->p_prof_to, proftimer_trampoline, p);
 
/*
 * Duplicate sub-structures as needed.
Index: kern/kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.80
diff -u -p -r1.80 kern_time.c
--- kern/kern_time.c17 Jun 2013 19:11:54 -  1.80
+++ kern/kern_time.c16 Aug 2013 05:57:27 -
@@ -568,7 +568,7 @@ sys_setitimer(struct proc *p, void *v, r
	if (which == ITIMER_VIRTUAL)
	timeout_del(&pr->ps_virt_to);
	if (which == ITIMER_PROF)
-   timeout_del(&pr->ps_prof_to);
+   timeout_del(&p->p_prof_to);
splx(s);
}
 
Index: sys/proc.h
===
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.168
diff -u -p -r1.168 proc.h
--- sys/proc.h  6 Jun 2013 13:09:37 -   1.168
+++ sys/proc.h  16 Aug 2013 05:57:28 -
@@ -210,7 +210,6 @@ struct process {
struct  timespec ps_start;  /* starting time. */
struct  timeout ps_realit_to;   /* real-time itimer trampoline. */
struct  timeout ps_virt_to; /* virtual itimer trampoline. */
-   struct  timeout ps_prof_to; /* prof itimer trampoline. */
 
int ps_refcnt;  /* Number of references. */
 };
@@ -337,6 +336,7 @@ struct proc {
 /* End area that is copied on creation. */
 #definep_endcopy   p_addr
 
+   struct  timeout p_prof_to;  /* prof itimer trampoline. */
struct  user *p_addr;   /* Kernel virtual addr of u-area */
struct  mdproc p_md;/* Any machine-dependent fields. */
 



Re: threaded prof signals

2013-08-16 Thread Philip Guenther
On Fri, 16 Aug 2013, Ted Unangst wrote:
 As per http://research.swtch.com/macpprof
 
 We deliver all prof signals to the main thread, which is unlikely to
 result in accurate profiling info. I think the diff below fixes things.

Making ITIMER_PROF per-thread like that matches neither what other OS's do 
nor POSIX.  jsing@ pinged about this last week and my comment then was 
that this seems like a bug in signal delivery, not in the triggering of 
the profile timer: when SIGPROF is delivered, it should go to the current 
thread if it's a possible candidate.  Indeed, that should always be true: 
picking some other thread delays delivery, breaks profiling, and violates 
the requirements on kill(2).

So, I proposed the diff below to jsing, which seemed to solve the 
profiling problem...and then got really busy with some time related 
stuff...

Most of the diff below is reindenting of the TAILQ_FOREACH() loop.

To forestall a question: if some other thread is sigwait()ing but the 
current thread doesn't have the signal blocked, then we currently prefer 
to deliver to the sigwait()ing thread, but this diff prefers the current 
thread.  That's legal as the behavior is unspecified if you sigwait() 
while the signal isn't blocked in all threads.

opinions?

Philip


Index: kern/kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.152
diff -u -p -r1.152 kern_sig.c
--- kern/kern_sig.c 1 Jun 2013 04:05:26 -   1.152
+++ kern/kern_sig.c 16 Aug 2013 01:02:52 -
@@ -811,27 +811,39 @@ ptsignal(struct proc *p, int signum, enu
 		}
 
 		/*
-		 * A process-wide signal can be diverted to a different
-		 * thread that's in sigwait() for this signal.  If there
-		 * isn't such a thread, then pick a thread that doesn't
-		 * have it blocked so that the stop/kill consideration
-		 * isn't delayed.  Otherwise, mark it pending on the
-		 * main thread.
+		 * If the current thread can process the signal
+		 * immediately, either because it's sigwait()ing
+		 * on it or has it unblocked, then have it take it.
 		 */
-		TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
-			/* ignore exiting threads */
-			if (q->p_flag & P_WEXIT)
-				continue;
+		q = curproc;
+		if (q != NULL && q->p_p == pr && (q->p_flag & P_WEXIT) == 0 &&
+		    ((q->p_sigdivert & mask) || (q->p_sigmask & mask) == 0))
+			p = q;
+		else {
+			/*
+			 * A process-wide signal can be diverted to a
+			 * different thread that's in sigwait() for this
+			 * signal.  If there isn't such a thread, then
+			 * pick a thread that doesn't have it blocked so
+			 * that the stop/kill consideration isn't
+			 * delayed.  Otherwise, mark it pending on the
+			 * main thread.
+			 */
+			TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
+				/* ignore exiting threads */
+				if (q->p_flag & P_WEXIT)
+					continue;
 
-			/* sigwait: definitely go to this thread */
-			if (q->p_sigdivert & mask) {
-				p = q;
-				break;
-			}
+				/* sigwait: definitely go to this thread */
+				if (q->p_sigdivert & mask) {
+					p = q;
+					break;
+				}
 
-			/* unblocked: possibly go to this thread */
-			if ((q->p_sigmask & mask) == 0)
-				p = q;
+				/* unblocked: possibly go to this thread */
+				if ((q->p_sigmask & mask) == 0)
+					p = q;
+			}
 		}
 	}
 



Re: threaded prof signals

2013-08-16 Thread Philip Guenther
On Fri, Aug 16, 2013 at 12:23 AM, Ted Unangst t...@tedunangst.com wrote:

 On Thu, Aug 15, 2013 at 23:39, Philip Guenther wrote:
  Making ITIMER_PROF per-thread like that matches neither what other OS's
 do
  nor POSIX.  jsing@ pinged about this last week and my comment then was
  that this seems like a bug in signal delivery, not in the triggering of
  the profile timer: when SIGPROF is delivered, it should go to the current
  thread if it's a possible candidate.  Indeed, that should always be true:
  picking some other thread delays delivery, breaks profiling, and violates
  the requirements on kill(2).

 Actually, here's my concern. There's only one timeout for the process.
 What happens when two threads are running on two CPUs? Is there a
 guarantee that cpu0 will both set and execute the timeout before cpu1
 sets it, or is there a race where cpu1 will fail to send the signal to
 its thread because cpu0 has already processed the timeout?


You get one signal, which is correct: the criterion is that the process is
supposed to be sent one signal after the process, totaling across threads,
consumes it_value of CPU-time, and then another every time that total
accumulates another it_interval again.  When your process with two threads
has its total cross over while two threads are running, it only crossed
*once*, not twice, so it should only get one signal.

(Well, if it_interval is small enough, smaller than a tick, then I suppose
it's possible for the total to cross twice and generate two signals, but
I don't think that's what you had in mind...)


Philip Guenther


Re: threaded prof signals

2013-08-16 Thread Mike Belopuhov
On 16 August 2013 09:23, Ted Unangst t...@tedunangst.com wrote:
 Actually, here's my concern. There's only one timeout for the process.
 What happens when two threads are running on two CPUs? Is there a
 guarantee that cpu0 will both set and execute the timeout before cpu1
 sets it, or is there a race where cpu1 will fail to send the signal to
 its thread because cpu0 has already processed the timeout?


softclock and hence the timeout processing is done only by the cpu0
currently.



Re: threaded prof signals

2013-08-16 Thread Ted Unangst
On Fri, Aug 16, 2013 at 12:33, Mike Belopuhov wrote:
 On 16 August 2013 09:23, Ted Unangst t...@tedunangst.com wrote:
 Actually, here's my concern. There's only one timeout for the process.
 What happens when two threads are running on two CPUs? Is there a
 guarantee that cpu0 will both set and execute the timeout before cpu1
 sets it, or is there a race where cpu1 will fail to send the signal to
 its thread because cpu0 has already processed the timeout?

 
 softclock and hence the timeout processing is done only by the cpu0
 currently.

That's really good to know. I couldn't remember if it was hardclock or
softclock or what exactly.

In that case, guenther, I think your diff is incorrect. The thread
running on cpu0 may not have been the thread running when the timer
expired. And if the thread on cpu0 is an unrelated process, we're back
to semi random delivery. If you look at my diff again, I didn't actually
create a timer per thread, only a timeout, and the timeout is only
used as a trampoline to get the proc argument from hardclock to
psignal.

(I think your diff may make sense for other reasons, but it's not
always going to send sigprof to the right thread.)



Re: threaded prof signals

2013-08-16 Thread Mark Kettenis
 Date: Thu, 15 Aug 2013 23:39:36 -0700
 From: Philip Guenther guent...@gmail.com
 
 On Fri, 16 Aug 2013, Ted Unangst wrote:
  As per http://research.swtch.com/macpprof
  
  We deliver all prof signals to the main thread, which is unlikely to
  result in accurate profiling info. I think the diff below fixes things.
 
 Making ITIMER_PROF per-thread like that matches neither what other OS's do 
 nor POSIX.  jsing@ pinged about this last week and my comment then was 
 that this seems like a bug in signal delivery, not in the triggering of 
 the profile timer: when SIGPROF is delivered, it should go to the current 
 thread if it's a possible candidate.  Indeed, that should always be true: 
 picking some other thread delays delivery, breaks profiling, and violates 
 the requirements on kill(2).
 
 So, I proposed the diff below to jsing, which seemed to solve the 
 profiling problem...and then got really busy with some time related 
 stuff...
 
 Most of the diff below is reindenting of the TAILQ_FOREACH() loop.
 
 To forestall a question: if some other thread is sigwait()ing but the 
 current thread doesn't have the signal blocked, then we currently prefer 
 to deliver to the sigwait()ing thread, but this diff prefers the current 
 thread.  That's legal as the behavior is unspecified if you sigwait() 
 while the signal isn't blocked in all threads.
 
 opinions?

Wonder if this fixes/alleviates the problem with the excessive ipi's
that people have been complaining about.

Can't find the text in POSIX that says you shouldn't call sigwait()
without blocking signals in *all* threads, but it doesn't say you
shouldn't call it without blocking them.  And the rationale part
clearly does say that the specification of signals is deliberately
vague to allow implementations to deliver signals in the most
convenient way.

Do wonder if we should do the same in the loop over all threads though
to be consistent.

 Index: kern/kern_sig.c
 ===
 RCS file: /cvs/src/sys/kern/kern_sig.c,v
 retrieving revision 1.152
 diff -u -p -r1.152 kern_sig.c
 --- kern/kern_sig.c   1 Jun 2013 04:05:26 -   1.152
 +++ kern/kern_sig.c   16 Aug 2013 01:02:52 -
 @@ -811,27 +811,39 @@ ptsignal(struct proc *p, int signum, enu
  		}
  
  		/*
 -		 * A process-wide signal can be diverted to a different
 -		 * thread that's in sigwait() for this signal.  If there
 -		 * isn't such a thread, then pick a thread that doesn't
 -		 * have it blocked so that the stop/kill consideration
 -		 * isn't delayed.  Otherwise, mark it pending on the
 -		 * main thread.
 +		 * If the current thread can process the signal
 +		 * immediately, either because it's sigwait()ing
 +		 * on it or has it unblocked, then have it take it.
  		 */
 -		TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
 -			/* ignore exiting threads */
 -			if (q->p_flag & P_WEXIT)
 -				continue;
 +		q = curproc;
 +		if (q != NULL && q->p_p == pr && (q->p_flag & P_WEXIT) == 0 &&
 +		    ((q->p_sigdivert & mask) || (q->p_sigmask & mask) == 0))
 +			p = q;
 +		else {
 +			/*
 +			 * A process-wide signal can be diverted to a
 +			 * different thread that's in sigwait() for this
 +			 * signal.  If there isn't such a thread, then
 +			 * pick a thread that doesn't have it blocked so
 +			 * that the stop/kill consideration isn't
 +			 * delayed.  Otherwise, mark it pending on the
 +			 * main thread.
 +			 */
 +			TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) {
 +				/* ignore exiting threads */
 +				if (q->p_flag & P_WEXIT)
 +					continue;
  
 -				/* sigwait: definitely go to this thread */
 -				if (q->p_sigdivert & mask) {
 -					p = q;
 -					break;
 -				}
 +				/* sigwait: definitely go to this thread */
 +				if (q->p_sigdivert & mask) {
 +					p = q;
 +					break;
 +				}
  
 -				/* unblocked: possibly go to this thread */
 -				if ((q->p_sigmask & mask) == 0)
 -					p = q;
 +				/* unblocked: possibly go to this thread */
 +				if ((q->p_sigmask & mask) == 0)
 +					p = q;
 +			}
  		}
  	}