sys/sched.h: conceal schedstate_percpu definition from userspace

2023-10-29 Thread Scott Cheloha
struct schedstate_percpu contains clockintr pointers.  struct
clockintr is not defined in userspace, so we need to conceal
the schedstate_percpu definition from userspace.  Nothing in
base userspace currently uses the struct.

I think we should leave  in place for now.  Something
might depend upon it to compile.  We could circle back and move it under
_KERNEL in a separate patch.

Preferences?  ok?
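
For illustration, here is the guard pattern in isolation, a minimal sketch
rather than the literal header contents (fields abbreviated from the real
struct): userspace keeps seeing struct cpustats and CPUSTATS_ONLINE, while
the scheduler internals only exist under _KERNEL.

    /* sys/sched.h, sketched */
    struct cpustats {
            uint64_t        cs_time[CPUSTATES];     /* CPU state statistics */
            uint64_t        cs_flags;               /* see below */
    };

    #define CPUSTATS_ONLINE 0x0001  /* CPU is schedulable */

    #ifdef _KERNEL
    struct clockintr;                       /* opaque; never defined for userspace */
    struct schedstate_percpu {
            struct clockintr *spc_itimer;   /* itimer_update handle */
            struct clockintr *spc_profclock;/* profclock handle */
            /* ... */
    };
    #endif /* _KERNEL */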

Index: sched.h
===
RCS file: /cvs/src/sys/sys/sched.h,v
diff -u -p -r1.67 sched.h
--- sched.h 24 Oct 2023 13:20:11 -  1.67
+++ sched.h 29 Oct 2023 20:44:50 -
@@ -88,6 +88,15 @@
 #define CP_IDLE		5
 #define CPUSTATES  6
 
+struct cpustats {
+   uint64_tcs_time[CPUSTATES]; /* CPU state statistics */
+   uint64_tcs_flags;   /* see below */
+};
+
+#define CPUSTATS_ONLINE	0x0001	/* CPU is schedulable */
+
+#ifdef _KERNEL
+
 #define	SCHED_NQS	32	/* 32 run queues. */
 
 struct clockintr;
@@ -123,15 +132,6 @@ struct schedstate_percpu {
 * without delay */
u_char spc_smrgp;   /* this CPU's view of grace period */
 };
-
-struct cpustats {
-   uint64_tcs_time[CPUSTATES]; /* CPU state statistics */
-   uint64_tcs_flags;   /* see below */
-};
-
-#define CPUSTATS_ONLINE	0x0001	/* CPU is schedulable */
-
-#ifdef _KERNEL
 
 /* spc_flags */
 #define SPCF_SEENRR 0x0001  /* process has seen roundrobin() */



Re: bt(5), btrace(8): execute END probe and print maps after exit() statement

2023-10-21 Thread Scott Cheloha
On Sat, Oct 21, 2023 at 07:17:05PM +0200, Martin Pieuchot wrote:
> On 18/10/23(Wed) 12:56, Scott Cheloha wrote:
> > Hi,
> > 
> > A bt(5) exit() statement causes the btrace(8) interpreter to exit(3)
> > immediately.
> > 
> > A BPFtrace exit() statement is more nuanced: the END probe is executed
> > and the contents of all maps are printed before the interpreter exits.
> > 
> > This patch adds a halting check after the execution of each bt(5)
> > statement.  If a statement causes the program to halt, the halt
> > bubbles up to the top-level rule evaluation loop and terminates
> > execution.  rules_teardown() then runs, just as if the program had
> > received SIGTERM.
> > 
> > Two edge-like cases:
> > 
> > 1. You can exit() from BEGIN.  rules_setup() returns non-zero if this
> >happens so the main loop knows to halt immediately.
> > 
> > 2. You can exit() from END.  This is just an early-return: the END probe
> >doesn't run again.
> > 
> > Thoughts?
> 
> Makes sense to ease the transition from bpftrace scripts.  Ok with me if
> you make sure the regression tests still pass.  Some outputs might
> depend on the actual behavior and would need to be updated.

Agh, my mistake, there are two tests that depend on the current
behavior.  I have updated them below.

ok with the test fixes?
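
For reference, here is the shape of the change as a simplified sketch (the
real functions take more state, and the member and constant names below are
approximations, not the literal btrace.c code): each evaluation layer returns
non-zero once an exit() statement has run, and the result bubbles up until
rules_do() breaks out of its read loop and calls rules_teardown().

    int
    stmt_eval(struct bt_stmt *bs, struct dt_evt *dtev)
    {
            if (bs->bs_act == B_AC_EXIT)
                    return 1;               /* exit(): ask callers to halt */
            /* ... evaluate the statement as before ... */
            return 0;
    }

    int
    rule_eval(struct bt_rule *r, struct dt_evt *dtev)
    {
            struct bt_stmt *bs;

            SLIST_FOREACH(bs, &r->br_action, bs_next) {
                    if (stmt_eval(bs, dtev))
                            return 1;       /* bubble the halt up */
            }
            return 0;
    }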

Index: usr.sbin/btrace/btrace.c
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v
retrieving revision 1.79
diff -u -p -r1.79 btrace.c
--- usr.sbin/btrace/btrace.c12 Oct 2023 15:16:44 -  1.79
+++ usr.sbin/btrace/btrace.c22 Oct 2023 01:21:21 -
@@ -71,10 +71,10 @@ struct dtioc_probe_info *dtpi_get_by_val
  * Main loop and rule evaluation.
  */
 void rules_do(int);
-void rules_setup(int);
-void rules_apply(int, struct dt_evt *);
+int rules_setup(int);
+int rules_apply(int, struct dt_evt *);
 void rules_teardown(int);
-void rule_eval(struct bt_rule *, struct dt_evt *);
+int rule_eval(struct bt_rule *, struct dt_evt *);
 void rule_printmaps(struct bt_rule *);
 
 /*
@@ -84,7 +84,7 @@ uint64_t   builtin_nsecs(struct dt_evt *
 const char *builtin_kstack(struct dt_evt *);
 const char *builtin_arg(struct dt_evt *, enum bt_argtype);
 struct bt_arg  *fn_str(struct bt_arg *, struct dt_evt *, char *);
-void stmt_eval(struct bt_stmt *, struct dt_evt *);
+int stmt_eval(struct bt_stmt *, struct dt_evt *);
 void stmt_bucketize(struct bt_stmt *, struct dt_evt *);
 void stmt_clear(struct bt_stmt *);
 void stmt_delete(struct bt_stmt *, struct dt_evt *);
@@ -405,6 +405,7 @@ void
 rules_do(int fd)
 {
struct sigaction sa;
+   int halt = 0;
 
memset(&sa, 0, sizeof(sa));
sigemptyset(&sa.sa_mask);
@@ -415,9 +416,9 @@ rules_do(int fd)
if (sigaction(SIGTERM, &sa, NULL))
err(1, "sigaction");
 
-   rules_setup(fd);
+   halt = rules_setup(fd);
 
-   while (!quit_pending && g_nprobes > 0) {
+   while (!quit_pending && !halt && g_nprobes > 0) {
static struct dt_evt devtbuf[64];
ssize_t rlen;
size_t i;
@@ -434,8 +435,11 @@ rules_do(int fd)
if ((rlen % sizeof(struct dt_evt)) != 0)
err(1, "incorrect read");
 
-   for (i = 0; i < rlen / sizeof(struct dt_evt); i++)
-   rules_apply(fd, &devtbuf[i]);
+   for (i = 0; i < rlen / sizeof(struct dt_evt); i++) {
+   halt = rules_apply(fd, &devtbuf[i]);
+   if (halt)
+   break;
+   }
}
 
rules_teardown(fd);
@@ -484,7 +488,7 @@ rules_action_scan(struct bt_stmt *bs)
return evtflags;
 }
 
-void
+int
 rules_setup(int fd)
 {
struct dtioc_probe_info *dtpi;
@@ -493,7 +497,7 @@ rules_setup(int fd)
struct bt_probe *bp;
struct bt_stmt *bs;
struct bt_arg *ba;
-   int dokstack = 0, on = 1;
+   int dokstack = 0, halt = 0, on = 1;
uint64_t evtflags;
 
TAILQ_FOREACH(r, &g_rules, br_next) {
@@ -553,7 +557,7 @@ rules_setup(int fd)
clock_gettime(CLOCK_REALTIME, _devt.dtev_tsp);
 
if (rbegin)
-   rule_eval(rbegin, _devt);
+   halt = rule_eval(rbegin, _devt);
 
/* Enable all probes */
TAILQ_FOREACH(r, &g_rules, br_next) {
@@ -571,9 +575,14 @@ rules_setup(int fd)
if (ioctl(fd, DTIOCRECORD, &on))
err(1, "DTIOCRECORD");

bt(5), btrace(8): execute END probe and print maps after exit() statement

2023-10-18 Thread Scott Cheloha
Hi,

A bt(5) exit() statement causes the btrace(8) interpreter to exit(3)
immediately.

A BPFtrace exit() statement is more nuanced: the END probe is executed
and the contents of all maps are printed before the interpreter exits.

This patch adds a halting check after the execution of each bt(5)
statement.  If a statement causes the program to halt, the halt
bubbles up to the top-level rule evaluation loop and terminates
execution.  rules_teardown() then runs, just as if the program had
received SIGTERM.

Two edge-like cases:

1. You can exit() from BEGIN.  rules_setup() returns non-zero if this
   happens so the main loop knows to halt immediately.

2. You can exit() from END.  This is just an early-return: the END probe
   doesn't run again.

Thoughts?
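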

$ btrace -e '
BEGIN {
@[probe] = "reached";
exit();
@[probe] = "not reached";
}
END {
@[probe] = "reached";
exit();
@[probe] = "not reached";
}'

Index: btrace.c
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v
retrieving revision 1.79
diff -u -p -r1.79 btrace.c
--- btrace.c12 Oct 2023 15:16:44 -  1.79
+++ btrace.c18 Oct 2023 17:54:16 -
@@ -71,10 +71,10 @@ struct dtioc_probe_info *dtpi_get_by_val
  * Main loop and rule evaluation.
  */
 void rules_do(int);
-void rules_setup(int);
-void rules_apply(int, struct dt_evt *);
+int rules_setup(int);
+int rules_apply(int, struct dt_evt *);
 void rules_teardown(int);
-void rule_eval(struct bt_rule *, struct dt_evt *);
+int rule_eval(struct bt_rule *, struct dt_evt *);
 void rule_printmaps(struct bt_rule *);
 
 /*
@@ -84,7 +84,7 @@ uint64_t   builtin_nsecs(struct dt_evt *
 const char *builtin_kstack(struct dt_evt *);
 const char *builtin_arg(struct dt_evt *, enum bt_argtype);
 struct bt_arg  *fn_str(struct bt_arg *, struct dt_evt *, char *);
-void stmt_eval(struct bt_stmt *, struct dt_evt *);
+int stmt_eval(struct bt_stmt *, struct dt_evt *);
 void stmt_bucketize(struct bt_stmt *, struct dt_evt *);
 void stmt_clear(struct bt_stmt *);
 void stmt_delete(struct bt_stmt *, struct dt_evt *);
@@ -405,6 +405,7 @@ void
 rules_do(int fd)
 {
struct sigaction sa;
+   int halt = 0;
 
memset(&sa, 0, sizeof(sa));
sigemptyset(&sa.sa_mask);
@@ -415,9 +416,9 @@ rules_do(int fd)
if (sigaction(SIGTERM, &sa, NULL))
err(1, "sigaction");
 
-   rules_setup(fd);
+   halt = rules_setup(fd);
 
-   while (!quit_pending && g_nprobes > 0) {
+   while (!quit_pending && !halt && g_nprobes > 0) {
static struct dt_evt devtbuf[64];
ssize_t rlen;
size_t i;
@@ -434,8 +435,11 @@ rules_do(int fd)
if ((rlen % sizeof(struct dt_evt)) != 0)
err(1, "incorrect read");
 
-   for (i = 0; i < rlen / sizeof(struct dt_evt); i++)
-   rules_apply(fd, &devtbuf[i]);
+   for (i = 0; i < rlen / sizeof(struct dt_evt); i++) {
+   halt = rules_apply(fd, &devtbuf[i]);
+   if (halt)
+   break;
+   }
}
 
rules_teardown(fd);
@@ -484,7 +488,7 @@ rules_action_scan(struct bt_stmt *bs)
return evtflags;
 }
 
-void
+int
 rules_setup(int fd)
 {
struct dtioc_probe_info *dtpi;
@@ -493,7 +497,7 @@ rules_setup(int fd)
struct bt_probe *bp;
struct bt_stmt *bs;
struct bt_arg *ba;
-   int dokstack = 0, on = 1;
+   int dokstack = 0, halt = 0, on = 1;
uint64_t evtflags;
 
TAILQ_FOREACH(r, &g_rules, br_next) {
@@ -553,7 +557,7 @@ rules_setup(int fd)
clock_gettime(CLOCK_REALTIME, _devt.dtev_tsp);
 
if (rbegin)
-   rule_eval(rbegin, _devt);
+   halt = rule_eval(rbegin, _devt);
 
/* Enable all probes */
TAILQ_FOREACH(r, &g_rules, br_next) {
@@ -571,9 +575,14 @@ rules_setup(int fd)
if (ioctl(fd, DTIOCRECORD, &on))
err(1, "DTIOCRECORD");
}
+
+   return halt;
 }
 
-void
+/*
+ * Returns non-zero if the program should halt.
+ */
+int
 rules_apply(int fd, struct dt_evt *dtev)
 {
struct bt_rule *r;
@@ -586,9 +595,11 @@ rules_apply(int fd, struct dt_evt *dtev)
continue;
 
dtai_cache(fd, _dtpis[dtev->dtev_pbn - 1]);
-   rule_eval(r, dtev);
+   if (rule_eval(r, dtev))
+   return 1;
}
}
+   return 0;
 }
 
 void
@@ -637,7 +648,10 @@ rules_teardown(int fd)

Re: timeout(1): align execvp(3) failure statuses with GNU timeout

2023-10-16 Thread Scott Cheloha
On Sun, Oct 15, 2023 at 01:48:13PM -0600, Todd C. Miller wrote:
> On Sun, 15 Oct 2023 13:53:46 -0500, Scott Cheloha wrote:
> 
> > Align timeout(1)'s execvp(3) failure statuses with those of GNU
> > timeout.  127 for ENOENT, 126 for everything else.
> 
> Looks correct to me.  OK millert@

Per deraadt@, update EXIT STATUS, too.

Do we need to keep the "128 + signal" bit?  I thought that was a
normal Unix thing, and we tend to avoid saying things that are
implicitly true for every utility.
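
For context, "128 + signal" is the usual convention a wrapper reports when
the child was killed by a signal.  A sketch of how it falls out of the
wait(2) status (not timeout.c's literal code, just the convention):

    #include <sys/wait.h>

    int
    status_to_code(int status)
    {
            if (WIFEXITED(status))
                    return WEXITSTATUS(status);     /* child's own exit code */
            if (WIFSIGNALED(status))
                    return 128 + WTERMSIG(status);  /* the "128 + signal" case */
            return 1;
    }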

Index: timeout.c
===
RCS file: /cvs/src/usr.bin/timeout/timeout.c,v
retrieving revision 1.25
diff -u -p -r1.25 timeout.c
--- timeout.c   13 Jan 2023 06:53:04 -  1.25
+++ timeout.c   16 Oct 2023 15:29:02 -
@@ -260,7 +260,8 @@ main(int argc, char **argv)
signal(SIGTTOU, SIG_DFL);
 
execvp(argv[0], argv);
-   err(1, "%s", argv[0]);
+   warn("%s", argv[0]);
+   _exit(errno == ENOENT ? 127 : 126);
}
 
/* parent continues here */
Index: timeout.1
===
RCS file: /cvs/src/usr.bin/timeout/timeout.1,v
retrieving revision 1.7
diff -u -p -r1.7 timeout.1
--- timeout.1   12 Jan 2023 14:08:39 -  1.7
+++ timeout.1   16 Oct 2023 15:29:02 -
@@ -92,20 +92,29 @@ hours
 days
 .El
 .Sh EXIT STATUS
-If the timeout was not reached or
-.Fl p
-was set, the exit status of
-.Ar command
-is returned.
+The
+.Nm
+utility may return one of the following statuses:
 .Pp
-If the timeout was reached and
+.Bl -tag -compact
+.It 124
+The time limit expired and the
 .Fl p
-was not set, an exit status of 124 is returned.
-.Pp
-If
+flag was not set.
+.It 126
+The
 .Ar command
-exited after receiving a signal, the exit status returned is the signal number
-plus 128.
+could not be executed.
+.It 127
+The
+.Ar command
+was not found.
+.El
+.Pp
+Otherwise,
+.Nm
+returns the exit status of the
+.Ar command .
 .Sh SEE ALSO
 .Xr kill 1 ,
 .Xr signal 3



timeout(1): align execvp(3) failure statuses with GNU timeout

2023-10-15 Thread Scott Cheloha
Align timeout(1)'s execvp(3) failure statuses with those of GNU
timeout.  127 for ENOENT, 126 for everything else.

$ cd /tmp
$ touch script.sh
$ ls -l script.sh
-rw-r--r--  1 ssc  wheel  0 Oct 15 13:43 script.sh
$ gtimeout 1.0 ./script.sh ; echo $?
gtimeout: failed to run command './script.sh': Permission denied
126
$ timeout 1.0 ./script.sh ; echo $?
timeout: ./script.sh: Permission denied
1
$ gtimeout 1.0 ./not-a-script.sh ; echo $?
gtimeout: failed to run command './not-a-script.sh': No such file or directory
127
$ timeout 1.0 ./not-a-script.sh ; echo $?
timeout: ./not-a-script.sh: No such file or directory
1

While here, _exit(2) from the child process instead of exit(3).

ok?
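
The child-side convention, reduced to a sketch that mirrors the diff below:
report the exec failure, then leave with _exit(2) rather than exit(3) so the
forked child does not re-flush stdio buffers or run atexit(3) handlers it
inherited from the parent.

    #include <err.h>
    #include <errno.h>
    #include <unistd.h>

    static void
    child_exec(char **argv)
    {
            execvp(argv[0], argv);
            /* Only reached if execvp(3) failed. */
            warn("%s", argv[0]);
            _exit(errno == ENOENT ? 127 : 126);
    }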

Index: timeout.c
===
RCS file: /cvs/src/usr.bin/timeout/timeout.c,v
retrieving revision 1.25
diff -u -p -r1.25 timeout.c
--- timeout.c   13 Jan 2023 06:53:04 -  1.25
+++ timeout.c   15 Oct 2023 18:47:04 -
@@ -260,7 +260,8 @@ main(int argc, char **argv)
signal(SIGTTOU, SIG_DFL);
 
execvp(argv[0], argv);
-   err(1, "%s", argv[0]);
+   warn("%s", argv[0]);
+   _exit(errno == ENOENT ? 127 : 126);
}
 
/* parent continues here */



Re: tcp syn cache unlock

2023-10-11 Thread Scott Cheloha
On Tue, Oct 10, 2023 at 05:26:14PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Oct 10, 2023 at 09:06:23AM -0500, Scott Cheloha wrote:
> > On Fri, Oct 06, 2023 at 03:41:39PM +0200, Alexander Bluhm wrote:
> > > On Fri, Oct 06, 2023 at 03:47:31PM +0300, Vitaliy Makkoveev wrote:
> > > > On Fri, Oct 06, 2023 at 02:14:52PM +0200, Alexander Bluhm wrote:
> > > > > > @@ -718,11 +743,13 @@ softclock(void *arg)
> > > > > > softclock_process_tick_timeout(to, new);
> > > > > > }
> > > > > > tostat.tos_softclocks++;
> > > > > > -   needsproc = !CIRCQ_EMPTY(&timeout_proc);
> > > > > > -   mtx_leave(&timeout_mutex);
> > > > > > -
> > > > > > -   if (needsproc)
> > > > > > +   if (!CIRCQ_EMPTY(&timeout_proc))
> > > > > > wakeup(&timeout_proc);
> > > > > > +#ifdef MULTIPROCESSOR
> > > > > > +   if(!CIRCQ_EMPTY(&timeout_proc_mpsafe))
> > > > > > +   wakeup(&timeout_proc_mpsafe);
> > > > > > +#endif
> > > > > > +   mtx_leave(&timeout_mutex);
> > > > > >  }
> > > > > >
> > > > > >  void
> > > > >
> > > > > Was there a good reason that wakeup() did run without mutex?
> > > > > Do we really want to change this?
> > > > >
> > > >
> > > > I dont understand you. Original code does wakeup() outside mutex. I
> > > > moved wakeup() under mutex. You want to move it back?
> > > 
> > > I just wanted to know why you moved it.
> > > 
> > > Now I see.  You use msleep_nsec() with timeout_mutex.  Putting
> > > wakeup in mutex ensures that you don't miss it.
> > 
> > Do we actually need to move the softclock() wakeup calls into the
> > mutex?  As long as CIRCQ_EMPTY(...) is evaluated within timeout_mutex,
> > the thread can't get stuck waiting for a wakeup that isn't coming.
> > Both threads now sleep via msleep_nsec(), so there is no "gap" between
> > evaluation and unlock.
> > 
> > Am I missing something else?
> 
> In other hand, why to not move them under the `timeout_mutex' mutex(9)?
> Does this unlocked call provides something significant?

If you want to move the wakeups into timeout_mutex, let's do it in a
separate patch.

However, near as I can tell, keeping the calls where they are is still
correct.  Please speak up if this is not true.

And note that all the wakeup() and softintr_schedule() calls are
currently made outside of timeout_mutex.  Moving them for no reason
feels like we are buying trouble.
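
To spell out why the current placement is safe, here is the pattern reduced
to a sketch (simplified from kern_timeout.c; the real thread drops the mutex
around each callback): the sleeping side re-checks the queue while holding
timeout_mutex, and msleep_nsec(9) releases that mutex atomically with going
to sleep, so a wakeup(9) issued after softclock() has left the mutex can at
worst arrive while the thread is already awake, never while it is committed
to sleeping on a non-empty queue.

    /* softclock(), interrupt side (sketch) */
    mtx_enter(&timeout_mutex);
    /* ... queue due TIMEOUT_PROC timeouts onto timeout_proc ... */
    needsproc = !CIRCQ_EMPTY(&timeout_proc);
    mtx_leave(&timeout_mutex);
    if (needsproc)
            wakeup(&timeout_proc);

    /* softclock_thread(), process side (sketch) */
    mtx_enter(&timeout_mutex);
    for (;;) {
            while (CIRCQ_EMPTY(&timeout_proc)) {
                    /* atomically releases timeout_mutex and sleeps */
                    msleep_nsec(&timeout_proc, &timeout_mutex, PSWP,
                        "tmoslp", INFSLP);
            }
            /* ... dequeue and run the pending timeouts ... */
    }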

> > > Nitpick: timeoutmp_proc should be timeout_procmp.  timeout_ is the
> > > prefix in this file.  mp suffix is easier to see at the end.
> > > 
> > > >+   if (kthread_create(softclockmp_thread, NULL, NULL, "softclockm"))
> > > "softclockm" -> "softclockmp"
> > > 
> > > OK bluhm@, but let's wait for cheloha@ and see what he thinks
> > 
> > Revised patch:
> > 
> > - Add TIMEOUT_MPSAFE support to timeout_barrier().  This is crucial.
> > - Keep the names in the existing namespaces where possible.
> > - Keep the wakeup(9) calls in softclock() outside of timeout_mutex.
> >   ... unless I have made an error, they can stay where they are.
> > - Trim the processing loops in the threads.
> > - Tweak the ddb(4) printing code to distinguish the locked and
> >   unlocked thread circqs.
> > 
> > mvs/bluhm: try this with your favorite process-context timeout and
> > make sure the timeouts still run.
> > 
> > Assuming everything works, ok?
> > 
> 
> ok by me, with the one nit:
> 
> > +   msleep_nsec(&timeout_proc, &timeout_mutex, PSWP, "bored",
> > +   INFSLP);
> 
> "bored" is used by tasks. Can you use another ident?

"tmoslp" matches up with the "tmobar" wmesg in timeout_barrier(), so
let's try "tmoslp".

I will commit this tomorrow unless I hear otherwise.

Index: share/man/man9/timeout.9
===
RCS file: /cvs/src/share/man/man9/timeout.9,v
retrieving revision 1.56
diff -u -p -r1.56 timeout.9
--- share/man/man9/timeout.91 Jan 2023 01:19:18 -   1.56
+++ share/man/man9/timeout.912 Oct 2023 02:06:29 -
@@ -193,11 +193,16 @@ Counts the time elapsed since the system
 The timeout's behavior may be configured with the bitwise OR of
 zero or more of the following
 .Fa flags :
-.Bl -tag -width TIMEOUT_PROC
+.Bl -tag -width TIMEOUT_MPSAFE
 .It Dv TIMEOUT_PROC

bt(5), btrace(8): support modulo operator ('%')

2023-10-10 Thread Scott Cheloha
This adds support for the modulo operator to btrace(8).

I was trying to use it like this:

$start = nsecs;
/* ... */
$elapsed = nsecs - $start;
printf("%d.%09d seconds\n",
$elapsed / 1000000000, $elapsed % 1000000000);

and noticed it was missing.

ok?
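
For reference, a C analogue of the snippet above, assuming $elapsed holds a
nanosecond count, just to show what the division/modulo pair computes:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t elapsed = 1234567890;  /* nanoseconds */

            /* prints "1.234567890 seconds" */
            printf("%" PRIu64 ".%09" PRIu64 " seconds\n",
                elapsed / 1000000000, elapsed % 1000000000);
            return 0;
    }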

Index: bt_parse.y
===
RCS file: /cvs/src/usr.sbin/btrace/bt_parse.y,v
retrieving revision 1.53
diff -u -p -r1.53 bt_parse.y
--- bt_parse.y  11 Sep 2023 19:01:26 -  1.53
+++ bt_parse.y  10 Oct 2023 16:07:45 -
@@ -184,7 +184,7 @@ filter  : /* empty */   { $$ = NULL; }
  * Give higher precedence to:
  *  1. && and ||
  *  2. ==, !=, <<, <, >=, >, +, =, &, ^, |
- *  3. * and /
+ *  3. * , / , %
  */
 expr   : expr OP_LAND term { $$ = ba_op(B_AT_OP_LAND, $1, $3); }
| expr OP_LOR term  { $$ = ba_op(B_AT_OP_LOR, $1, $3); }
@@ -207,6 +207,7 @@ term: term OP_EQ fterm  { $$ = ba_op(B_A
 
 fterm  : fterm '*' factor  { $$ = ba_op(B_AT_OP_MULT, $1, $3); }
| fterm '/' factor  { $$ = ba_op(B_AT_OP_DIVIDE, $1, $3); }
+   | fterm '%' factor  { $$ = ba_op(B_AT_OP_MODULO, $1, $3); }
| factor
;
 
Index: bt_parser.h
===
RCS file: /cvs/src/usr.sbin/btrace/bt_parser.h,v
retrieving revision 1.24
diff -u -p -r1.24 bt_parser.h
--- bt_parser.h 11 Sep 2023 19:01:26 -  1.24
+++ bt_parser.h 10 Oct 2023 16:07:45 -
@@ -163,6 +163,7 @@ struct bt_arg {
B_AT_OP_MINUS,
B_AT_OP_MULT,
B_AT_OP_DIVIDE,
+   B_AT_OP_MODULO,
B_AT_OP_BAND,
B_AT_OP_XOR,
B_AT_OP_BOR,
Index: btrace.c
===
RCS file: /cvs/src/usr.sbin/btrace/btrace.c,v
retrieving revision 1.78
diff -u -p -r1.78 btrace.c
--- btrace.c15 Sep 2023 10:59:02 -  1.78
+++ btrace.c10 Oct 2023 16:07:45 -
@@ -1416,6 +1416,9 @@ baexpr2long(struct bt_arg *ba, struct dt
case B_AT_OP_DIVIDE:
result = lval / rval;
break;
+   case B_AT_OP_MODULO:
+   result = lval % rval;
+   break;
case B_AT_OP_BAND:
result = lval & rval;
break;
@@ -1526,6 +1529,8 @@ ba_name(struct bt_arg *ba)
return "*";
case B_AT_OP_DIVIDE:
return "/";
+   case B_AT_OP_MODULO:
+   return "%";
case B_AT_OP_BAND:
return "&";
case B_AT_OP_XOR:



Re: tcp syn cache unlock

2023-10-10 Thread Scott Cheloha
On Fri, Oct 06, 2023 at 03:41:39PM +0200, Alexander Bluhm wrote:
> On Fri, Oct 06, 2023 at 03:47:31PM +0300, Vitaliy Makkoveev wrote:
> > On Fri, Oct 06, 2023 at 02:14:52PM +0200, Alexander Bluhm wrote:
> > > > @@ -718,11 +743,13 @@ softclock(void *arg)
> > > > softclock_process_tick_timeout(to, new);
> > > > }
> > > > tostat.tos_softclocks++;
> > > > -   needsproc = !CIRCQ_EMPTY(&timeout_proc);
> > > > -   mtx_leave(&timeout_mutex);
> > > > -
> > > > -   if (needsproc)
> > > > +   if (!CIRCQ_EMPTY(&timeout_proc))
> > > > wakeup(&timeout_proc);
> > > > +#ifdef MULTIPROCESSOR
> > > > +   if(!CIRCQ_EMPTY(&timeout_proc_mpsafe))
> > > > +   wakeup(&timeout_proc_mpsafe);
> > > > +#endif
> > > > +   mtx_leave(&timeout_mutex);
> > > >  }
> > > >
> > > >  void
> > >
> > > Was there a good reason that wakeup() did run without mutex?
> > > Do we really want to change this?
> > >
> >
> > I dont understand you. Original code does wakeup() outside mutex. I
> > moved wakeup() under mutex. You want to move it back?
> 
> I just wanted to know why you moved it.
> 
> Now I see.  You use msleep_nsec() with timeout_mutex.  Putting
> wakeup in mutex ensures that you don't miss it.

Do we actually need to move the softclock() wakeup calls into the
mutex?  As long as CIRCQ_EMPTY(...) is evaluated within timeout_mutex,
the thread can't get stuck waiting for a wakeup that isn't coming.
Both threads now sleep via msleep_nsec(), so there is no "gap" between
evaluation and unlock.

Am I missing something else?

> Nitpick: timeoutmp_proc should be timeout_procmp.  timeout_ is the
> prefix in this file.  mp suffix is easier to see at the end.
> 
> >+   if (kthread_create(softclockmp_thread, NULL, NULL, "softclockm"))
> "softclockm" -> "softclockmp"
> 
> OK bluhm@, but let's wait for cheloha@ and see what he thinks

Revised patch:

- Add TIMEOUT_MPSAFE support to timeout_barrier().  This is crucial.
- Keep the names in the existing namespaces where possible.
- Keep the wakeup(9) calls in softclock() outside of timeout_mutex.
  ... unless I have made an error, they can stay where they are.
- Trim the processing loops in the threads.
- Tweak the ddb(4) printing code to distinguish the locked and
  unlocked thread circqs.

mvs/bluhm: try this with your favorite process-context timeout and
make sure the timeouts still run.

Assuming everything works, ok?

Index: share/man/man9/timeout.9
===
RCS file: /cvs/src/share/man/man9/timeout.9,v
retrieving revision 1.56
diff -u -p -r1.56 timeout.9
--- share/man/man9/timeout.91 Jan 2023 01:19:18 -   1.56
+++ share/man/man9/timeout.910 Oct 2023 13:44:02 -
@@ -193,11 +193,16 @@ Counts the time elapsed since the system
 The timeout's behavior may be configured with the bitwise OR of
 zero or more of the following
 .Fa flags :
-.Bl -tag -width TIMEOUT_PROC
+.Bl -tag -width TIMEOUT_MPSAFE
 .It Dv TIMEOUT_PROC
 Execute the timeout in a process context instead of the default
 .Dv IPL_SOFTCLOCK
 interrupt context.
+.It Dv TIMEOUT_MPSAFE
+Execute the timeout without the kernel lock.
+Requires the
+.Dv TIMEOUT_PROC
+flag.
 .El
 .El
 .Pp
@@ -367,8 +372,9 @@ The function
 .Fa fn
 must not block and must be safe to execute on any CPU in the system.
 .Pp
-Currently,
-all timeouts are executed under the kernel lock.
+Timeouts without the
+.Dv TIMEOUT_MPSAFE
+flag are executed under the kernel lock.
 .Sh RETURN VALUES
 .Fn timeout_add ,
 .Fn timeout_add_sec ,
Index: sys/sys/timeout.h
===
RCS file: /cvs/src/sys/sys/timeout.h,v
retrieving revision 1.47
diff -u -p -r1.47 timeout.h
--- sys/sys/timeout.h   31 Dec 2022 16:06:24 -  1.47
+++ sys/sys/timeout.h   10 Oct 2023 13:44:02 -
@@ -54,6 +54,7 @@ struct timeout {
 #define TIMEOUT_ONQUEUE0x02/* on any timeout queue */
 #define TIMEOUT_INITIALIZED0x04/* initialized */
 #define TIMEOUT_TRIGGERED  0x08/* running or ran */
+#define TIMEOUT_MPSAFE 0x10/* run without kernel lock */
 
 struct timeoutstat {
uint64_t tos_added; /* timeout_add*(9) calls */
Index: sys/kern/kern_timeout.c
===
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.95
diff -u -p -r1.95 kern_timeout.c
--- sys/kern/kern_timeout.c 29 Jul 2023 06:52:08 -  1.95
+++ sys/kern/kern_timeout.c 10 Oct 2023 13:44:02 -
@@ -75,6 +75,9 @@ struct circq timeout_wheel_kc[BUCKETS];   
 struct circq timeout_new;  /* [T] New, unscheduled timeouts */
 struct circq timeout_todo; /* [T] Due or needs rescheduling */
 struct circq timeout_proc; /* [T] Due + needs process context */
+#ifdef MULTIPROCESSOR
+struct circq timeout_proc_mp;  /* [T] Process ctx + no kernel lock */
+#endif
 
 time_t 

Re: tcp syn cache unlock

2023-10-05 Thread Scott Cheloha
On Thu, Oct 05, 2023 at 12:57:24AM +0200, Alexander Bluhm wrote:
> 
> This is a first step to unlock TCP syn cache.  The timer function
> is independent of the socket code.  That makes it easy to start
> there.
> 
> [...]
> 
> Still missing:
> - [...]
> - Run timer without kernel lock.  I am not aware of such a feature.
>   There is already some network code that could benefit from that.
>   Can we get timer without kernel lock like TASKQ_MPSAFE implements
>   it for tasks?

This patch adds a TIMEOUT_MPSAFE flag for use with TIMEOUT_PROC.
Softint timeouts are a different story.

To run syn_cache_timer() without the kernel lock you would initialize
it like this:

timeout_set_flags(>sc_timer, syn_cache_timer, sc, KCLOCK_NONE,
TIMEOUT_PROC | TIMEOUT_MPSAFE);

Use with caution, this needs another set of eyes.
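
To sketch what the new flag implies on the dispatch side (an illustration of
the approach, not the posted code: the thread name is made up, while the
queue and helper names follow the diff and kern_timeout.c): MP-safe
process-context timeouts land on their own circq and are drained by a thread
that never wraps the callback in KERNEL_LOCK().

    void
    softclock_thread_mpsafe(void *arg)
    {
            struct timeout *to;

            mtx_enter(&timeout_mutex);
            for (;;) {
                    while (CIRCQ_EMPTY(&timeout_proc_mpsafe)) {
                            msleep_nsec(&timeout_proc_mpsafe, &timeout_mutex,
                                PSWP, "tmoslp", INFSLP);
                    }
                    to = timeout_from_circq(CIRCQ_FIRST(&timeout_proc_mpsafe));
                    CIRCQ_REMOVE(&to->to_list);
                    /* no KERNEL_LOCK()/KERNEL_UNLOCK() around the callback */
                    timeout_run(to);
            }
    }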

Index: share/man/man9/timeout.9
===
RCS file: /cvs/src/share/man/man9/timeout.9,v
retrieving revision 1.56
diff -u -p -r1.56 timeout.9
--- share/man/man9/timeout.91 Jan 2023 01:19:18 -   1.56
+++ share/man/man9/timeout.95 Oct 2023 16:09:33 -
@@ -193,11 +193,16 @@ Counts the time elapsed since the system
 The timeout's behavior may be configured with the bitwise OR of
 zero or more of the following
 .Fa flags :
-.Bl -tag -width TIMEOUT_PROC
+.Bl -tag -width TIMEOUT_MPSAFE
 .It Dv TIMEOUT_PROC
 Execute the timeout in a process context instead of the default
 .Dv IPL_SOFTCLOCK
 interrupt context.
+.It Dv TIMEOUT_MPSAFE
+Execute the timeout without the kernel lock.
+Requires the
+.Dv TIMEOUT_PROC
+flag.
 .El
 .El
 .Pp
@@ -367,8 +372,9 @@ The function
 .Fa fn
 must not block and must be safe to execute on any CPU in the system.
 .Pp
-Currently,
-all timeouts are executed under the kernel lock.
+Timeouts without the
+.Dv TIMEOUT_MPSAFE
+flag are executed under the kernel lock.
 .Sh RETURN VALUES
 .Fn timeout_add ,
 .Fn timeout_add_sec ,
Index: sys/sys/timeout.h
===
RCS file: /cvs/src/sys/sys/timeout.h,v
retrieving revision 1.47
diff -u -p -r1.47 timeout.h
--- sys/sys/timeout.h   31 Dec 2022 16:06:24 -  1.47
+++ sys/sys/timeout.h   5 Oct 2023 16:09:33 -
@@ -54,6 +54,7 @@ struct timeout {
 #define TIMEOUT_ONQUEUE0x02/* on any timeout queue */
 #define TIMEOUT_INITIALIZED0x04/* initialized */
 #define TIMEOUT_TRIGGERED  0x08/* running or ran */
+#define TIMEOUT_MPSAFE 0x10/* run without kernel lock */
 
 struct timeoutstat {
uint64_t tos_added; /* timeout_add*(9) calls */
Index: sys/kern/kern_timeout.c
===
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.95
diff -u -p -r1.95 kern_timeout.c
--- sys/kern/kern_timeout.c 29 Jul 2023 06:52:08 -  1.95
+++ sys/kern/kern_timeout.c 5 Oct 2023 16:09:34 -
@@ -75,6 +75,7 @@ struct circq timeout_wheel_kc[BUCKETS];   
 struct circq timeout_new;  /* [T] New, unscheduled timeouts */
 struct circq timeout_todo; /* [T] Due or needs rescheduling */
 struct circq timeout_proc; /* [T] Due + needs process context */
+struct circq timeout_proc_mpsafe;  /* [T] Process ctx + no kernel lock */
 
 time_t timeout_level_width[WHEELCOUNT];	/* [I] Wheel level width (seconds) */
 struct timespec tick_ts;   /* [I] Length of a tick (1/hz secs) */
@@ -228,6 +229,7 @@ timeout_startup(void)
CIRCQ_INIT(&timeout_new);
CIRCQ_INIT(&timeout_todo);
CIRCQ_INIT(&timeout_proc);
+   CIRCQ_INIT(&timeout_proc_mpsafe);
for (b = 0; b < nitems(timeout_wheel); b++)
CIRCQ_INIT(&timeout_wheel[b]);
for (b = 0; b < nitems(timeout_wheel_kc); b++)
@@ -261,10 +263,16 @@ void
 timeout_set_flags(struct timeout *to, void (*fn)(void *), void *arg, int kclock,
 int flags)
 {
+   KASSERT(!ISSET(flags, ~(TIMEOUT_PROC | TIMEOUT_MPSAFE)));
+
to->to_func = fn;
to->to_arg = arg;
to->to_kclock = kclock;
to->to_flags = flags | TIMEOUT_INITIALIZED;
+
+   /* For now, only process context timeouts may be marked MP-safe. */
+   if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+   KASSERT(ISSET(to->to_flags, TIMEOUT_PROC));
 }
 
 void
@@ -659,7 +667,10 @@ softclock_process_kclock_timeout(struct 
if (!new && timespeccmp(&to->to_abstime, &kc->kc_late, <=))
tostat.tos_late++;
if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-   CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+   if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+   CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+   else
+   CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
return;
}
timeout_run(to);
@@ -681,7 +692,10 @@ softclock_process_tick_timeout(struct ti
if (!new && delta < 0)

Re: dt(4), hardclock(9): move interval, profile providers to dedicated callback

2023-09-17 Thread Scott Cheloha
v2 is attached.

Clockintrs now have an argument.  If we pass the PCB as argument, we
can avoid doing a linear search to find the PCB during the interrupt.

One thing I'm unsure about is whether I need to add a "barrier" flag
to clockintr_cancel() and/or clockintr_disestablish().  This would
cause the caller to block until the clockintr callback function has
finished executing, which would ensure that it was safe to free the
PCB.

You're already using SMR, though, so a barrier may be totally
unnecessary.
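
Roughly, the shape of the change for dt(4) looks like this (an illustrative
sketch of the v2 approach; the extra clockintr_establish() argument and the
dp_clockintr member are assumptions about the attached patch, not committed
interfaces):

    /* At record-start time: dp rides along as the callback argument. */
    dp->dp_clockintr = clockintr_establish(&ci->ci_queue,
        dt_prov_profile_intr, dp);

    /* The callback gets the PCB back directly, no linear search needed. */
    void
    dt_prov_profile_intr(struct clockintr *cl, void *arg)
    {
            struct dt_pcb *dp = arg;

            clockintr_advance(cl, dp->dp_nsecs);    /* per-PCB period */
            /* ... record the stack sample for dp ... */
    }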

On Mon, Sep 04, 2023 at 01:39:25PM +0100, Martin Pieuchot wrote:
> On 25/08/23(Fri) 21:00, Scott Cheloha wrote:
> > On Thu, Aug 24, 2023 at 07:21:29PM +0200, Martin Pieuchot wrote:
> > > [...] 
> > > The only behavior that needs to be preserved is the output of dumping
> > > stacks.  That means DT_FA_PROFILE and DT_FA_STATIC certainly needs to
> > > be adapted with this change.  You can figure that out by looking at the
> > > output of /usr/src/share/btrace/kprofile.bt without and with this diff.
> > > 
> > > Please generate a FlameGraph to make sure they're still the same.
> > 
> > dt_prov_profile_intr() runs at the same stack depth as hardclock(), so
> > indeed they are still the same.
> 
> Lovely.
> 
> > > Apart from that I'd prefer if we could skip the mechanical change and
> > > go straight to what dt(4) needs.  Otherwise we will have to re-design
> > > everything.
> > 
> > I think a mechanical "move the code from point A to point B" patch is
> > useful.  It makes the changes easier to follow when tracing the
> > revision history in the future.
> > 
> > If you insist on skipping it, though, I think I can live without it.
> 
> I do insist.  It is really hard for me to follow and work with you
> because you're too verbose for my capacity.  If you want to work with
> me, please do smaller steps and do not mix so much in big diffs.  I
> have plenty of possible comments but can deal with huge chunks.
> 
> > > The current code assumes the periodic entry points are external to dt(4).
> > > This diff moves them in the middle of dt(4) but keeps the existing flow
> > > which makes the code very convoluted.
> > > 
> > > A starting point to understand the added complexity it so see that the
> > > DT_ENTER() macro are no longer needed if we move the entry points inside
> > > dt(4).
> > 
> > I did see that.  It seems really easy to remove the macros in a
> > subsequent patch, though.
> > 
> > Again, if you want to do it all in one patch that's OK.
> 
> Yes please.
> 
> > > The first periodic timeout is dt_prov_interval_enter().  It could be
> > > implemented as a per-PCB timeout_add_nsec(9).  The drawback of this
> > > approach is that it uses too much code in the kernel which is a problem
> > > when instrumenting the kernel itself.  Every subsystem used by dt(4) is
> > > impossible to instrument with btrace(8).
> > 
> > I think you can avoid this instrumentation problem by using clockintr,
> > where the callback functions are run from the hardware interrupt
> > context, just like hardclock().
> 
> Fair enough.
> 
> > > The second periodic timeout it dt_prov_profile_enter().  It is similar
> > > to the previous one and has to run on every CPU.
> > > 
> > > Both are currently bound to tick, but we want per-PCB time resolution.
> > > We can get rid of `dp_nticks' and `dp_maxtick' if we control when the
> > > timeouts fires.
> > 
> > My current thought is that each PCB could have its own "dp_period" or
> > "dp_interval", a count of nanoseconds.
> 
> Indeed.  We can have `dp_nsecs' and use that to determine if
> clockintr_advance() needs to be called in dt_ioctl_record_start().

I have added dt_pcb.dp_nsecs.

> > > [...]
> > 
> > The goal of clockintr is to provide a machine-independent API for
> > scheduling clock interrupts.  You can use it to implement something
> > like hardclock() or statclock().  We are already using it to implement
> > these functions, among others.
> 
> After reading all the code and the previous manuals, I understand it as
> a low-level per-CPU timeout API with nanosecond precision.  Is that it?

Yes.

The distinguishing feature is that it is usually wired up to a
platform backend, so it can deliver the interrupt at the requested
expiration time with relatively low error.

> > > >   One alternative is to start running the clock interrupts when they
> > > >   are allocated in dtpv_alloc() and stop them when they are freed in
> > > >

scheduler_start: move static timeout structs into callback functions

2023-09-16 Thread Scott Cheloha
The schedcpu() and loadavg() timeout structures are already hidden
from the global namespace.  We can further simplify scheduler_start()
by moving the structures into the callback functions and statically
initializing them with TIMEOUT_INITIALIZER(9).

ok?
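
In isolation the pattern looks like this (a sketch with a made-up callback
name, mirroring what the diff does to schedcpu() and update_loadavg()): the
timeout is statically bound to its own callback and rearms itself, so the
caller only has to kick it once.

    void
    example_tick(void *unused)
    {
            static struct timeout to = TIMEOUT_INITIALIZER(example_tick, NULL);

            /* ... periodic work ... */

            timeout_add_sec(&to, 1);        /* rearm one second from now */
    }

    /* The first direct call arms it; after that it keeps itself scheduled. */
    example_tick(NULL);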

Index: sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.86
diff -u -p -r1.86 sched_bsd.c
--- sched_bsd.c 10 Sep 2023 03:08:05 -  1.86
+++ sched_bsd.c 16 Sep 2023 16:24:33 -
@@ -117,9 +117,9 @@ roundrobin(struct clockintr *cl, void *c
  * 1, 5, and 15 minute intervals.
  */
 void
-update_loadavg(void *arg)
+update_loadavg(void *unused)
 {
-   struct timeout *to = (struct timeout *)arg;
+   static struct timeout to = TIMEOUT_INITIALIZER(update_loadavg, NULL);
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int i, nrun = 0;
@@ -135,7 +135,7 @@ update_loadavg(void *arg)
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
}
 
-   timeout_add_sec(to, 5);
+   timeout_add_sec(&to, 5);
 }
 
 /*
@@ -227,9 +227,9 @@ fixpt_t ccpu = 0.95122942450071400909 * 
  * Recompute process priorities, every second.
  */
 void
-schedcpu(void *arg)
+schedcpu(void *unused)
 {
-   struct timeout *to = (struct timeout *)arg;
+   static struct timeout to = TIMEOUT_INITIALIZER(schedcpu, NULL);
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
struct proc *p;
int s;
@@ -280,7 +280,7 @@ schedcpu(void *arg)
SCHED_UNLOCK(s);
}
wakeup();
-   timeout_add_sec(to, 1);
+   timeout_add_sec(&to, 1);
 }
 
 /*
@@ -726,23 +726,14 @@ sysctl_hwperfpolicy(void *oldp, size_t *
 }
 #endif
 
+/*
+ * Start the scheduler's periodic timeouts.
+ */
 void
 scheduler_start(void)
 {
-   static struct timeout schedcpu_to;
-   static struct timeout loadavg_to;
-
-   /*
-* We avoid polluting the global namespace by keeping the scheduler
-* timeouts static in this function.
-* We setup the timeout here and kick schedcpu once to make it do
-* its job.
-*/
-   timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
-   timeout_set(&loadavg_to, update_loadavg, &loadavg_to);
-
-   schedcpu(&schedcpu_to);
-   update_loadavg(&loadavg_to);
+   schedcpu(NULL);
+   update_loadavg(NULL);
 
 #ifndef SMALL_KERNEL
if (perfpolicy == PERFPOL_AUTO)



ksh(1): implement p_tv() with p_ts()

2023-09-11 Thread Scott Cheloha
p_tv() is identical to p_ts() in every way except for the subsecond
conversion constants.

Better to write p_ts() once: in p_tv(), convert from timeval to
timespec and call p_ts().

ok?
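
For reference, the only work p_tv() still has to do before delegating is the
microsecond-to-nanosecond conversion; TIMEVAL_TO_TIMESPEC(tv, &ts) amounts to:

    ts.tv_sec = tv->tv_sec;
    ts.tv_nsec = tv->tv_usec * 1000;        /* microseconds -> nanoseconds */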

Index: c_sh.c
===
RCS file: /cvs/src/bin/ksh/c_sh.c,v
retrieving revision 1.64
diff -u -p -r1.64 c_sh.c
--- c_sh.c  22 May 2020 07:50:07 -  1.64
+++ c_sh.c  12 Sep 2023 03:07:16 -
@@ -680,14 +680,10 @@ static void
 p_tv(struct shf *shf, int posix, struct timeval *tv, int width, char *prefix,
 char *suffix)
 {
-   if (posix)
-   shf_fprintf(shf, "%s%*lld.%02ld%s", prefix ? prefix : "",
-   width, (long long)tv->tv_sec, tv->tv_usec / 10000, suffix);
-   else
-   shf_fprintf(shf, "%s%*lldm%02lld.%02lds%s", prefix ? prefix : "",
-   width, (long long)tv->tv_sec / 60,
-   (long long)tv->tv_sec % 60,
-   tv->tv_usec / 10000, suffix);
+   struct timespec ts;
+
+   TIMEVAL_TO_TIMESPEC(tv, &ts);
+   p_ts(shf, posix, &ts, width, prefix, suffix);
 }
 
 static void



clockintr: replace CL_RNDSTAT with global variable "statclock_is_randomized"

2023-09-09 Thread Scott Cheloha
I'm going to break the big statclock() patch on tech@ into smaller
chunks that are easier to review.

The goal here is to move control of statclock() out of the clock
interrupt subsystem and transform it into a client of that subsystem.

I think we can do this in four parts.  Part 3 is the most complex.

1. Replace the CL_RNDSTAT flag with a new global variable,
   "statclock_is_randomized".

2. Add clockintr_advance_random() to the public sys/clockintr.h API
   so statclock() can use it.

3. Merge the contents of clockintr_statclock() into statclock(); make
   statclock() a real callback function that reschedules itself and
   transparently handles multiple expirations.

4. Move control of the statclock clockintr handle from the clock
   interrupt subsystem to the scheduler.

Attached is a patch for step 1.

In order to isolate the statclock() from the clock interrupt subsystem
we need to replace the CL_RNDSTAT flag with something equivalent.
This patch adds a new global variable, "statclock_is_randomized", to
kern_clock.c and prototypes it in sys/systm.h.  All CL_RNDSTAT checks
are replaced with "statclock_is_randomized" checks and instead of
passing CL_RNDSTAT to clockintr_init() we set "statclock_is_randomized".

ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.116
diff -u -p -r1.116 kern_clock.c
--- kern/kern_clock.c   9 Sep 2023 18:19:03 -   1.116
+++ kern/kern_clock.c   9 Sep 2023 19:03:13 -
@@ -86,6 +86,8 @@ int   ticks = INT_MAX - (15 * 60 * HZ);
 /* Don't force early wrap around, triggers bug in inteldrm */
 volatile unsigned long jiffies;
 
+int statclock_is_randomized;   /* [I] fixed or pseudorandom period? */
+
 /*
  * Initialize clock frequencies and start both clocks running.
  */
Index: kern/kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.45
diff -u -p -r1.45 kern_clockintr.c
--- kern/kern_clockintr.c   9 Sep 2023 17:07:59 -   1.45
+++ kern/kern_clockintr.c   9 Sep 2023 19:03:13 -
@@ -167,7 +167,7 @@ clockintr_cpu_init(const struct intrcloc
 * We can always advance the statclock.  There is no reason to
 * stagger a randomized statclock.
 */
-   if (!ISSET(clockintr_flags, CL_RNDSTAT)) {
+   if (!statclock_is_randomized) {
if (cq->cq_statclock->cl_expiration == 0) {
clockintr_stagger(cq->cq_statclock, statclock_avg,
multiplier, MAXCPUS);
@@ -466,7 +466,7 @@ clockintr_statclock(struct clockintr *cl
 {
uint64_t count, i;
 
-   if (ISSET(clockintr_flags, CL_RNDSTAT)) {
+   if (statclock_is_randomized) {
count = clockintr_advance_random(cl, statclock_min,
statclock_mask);
} else {
Index: sys/clockintr.h
===
RCS file: /cvs/src/sys/sys/clockintr.h,v
retrieving revision 1.12
diff -u -p -r1.12 clockintr.h
--- sys/clockintr.h 6 Sep 2023 02:33:18 -   1.12
+++ sys/clockintr.h 9 Sep 2023 19:03:14 -
@@ -114,8 +114,7 @@ struct clockintr_queue {
 #define CL_STATE_MASK  0x0001
 
 /* Global behavior flags. */
-#define CL_RNDSTAT 0x8000  /* randomized statclock */
-#define CL_FLAG_MASK   0x8000
+#define CL_FLAG_MASK   0x
 
 void clockintr_cpu_init(const struct intrclock *);
 int clockintr_dispatch(void *);
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.165
diff -u -p -r1.165 systm.h
--- sys/systm.h 23 Aug 2023 01:55:45 -  1.165
+++ sys/systm.h 9 Sep 2023 19:03:14 -
@@ -234,6 +234,7 @@ int tstohz(const struct timespec *);
 void   realitexpire(void *);
 
 extern uint32_t hardclock_period;
+extern int statclock_is_randomized;
 
 struct clockframe;
 void   hardclock(struct clockframe *);
Index: arch/amd64/amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.69
diff -u -p -r1.69 lapic.c
--- arch/amd64/amd64/lapic.c23 Aug 2023 01:55:46 -  1.69
+++ arch/amd64/amd64/lapic.c9 Sep 2023 19:03:14 -
@@ -498,7 +498,8 @@ lapic_initclocks(void)
 
stathz = hz;
profhz = stathz * 10;
-   clockintr_init(CL_RNDSTAT);
+   statclock_is_randomized = 1;
+   clockintr_init(0);
 }
 
 
Index: arch/arm64/dev/agtimer.c
===
RCS file: /cvs/src/sys/arch/arm64/dev/agtimer.c,v
retrieving revision 1.26
diff -u -p -r1.26 agtimer.c
--- arch/arm64/dev/agtimer.c23 Aug 2023 01:55:46 -  1.26
+++ arch/arm64/dev/agtimer.c9 Sep 2023 19:03:14 -
@@ -293,7 +293,8 @@ 

kernel: remove schedhz

2023-09-08 Thread Scott Cheloha
mpi@ notes that alpha doesn't set schedhz anymore, so schedhz is
always zero and serves no purpose.

We could remove it (patch below).  Or we could wait a bit to see if
schedclock() finds a future as an independent clock interrupt.

My guess is that roundrobin() (or something like it) will have a
future as a deadline clock interrupt -- when it fires, the running
thread is preempted if it is in userspace.  Or something like that.

I don't know about schedclock(), though.  I have a hard time imagining
the need to adjust the priority of the running thread *during* the
clock interrupt.  Even as I write that down, it sounds like a hack.

This patch deletes the variable and its sole remaining reference in
statclock().  Note that schedclock() itself remains in place and still
runs at its default frequency, approximately (stathz / 8).

ok?  wait a bit?

Index: sys/sched.h
===
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.61
diff -u -p -r1.61 sched.h
--- sys/sched.h 11 Aug 2023 22:02:50 -  1.61
+++ sys/sched.h 8 Sep 2023 18:28:32 -
@@ -146,7 +146,6 @@ struct cpustats {
 #defineESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ)
 
 extern uint32_t roundrobin_period;
-extern int schedhz;/* ideally: 16 */
 
 struct proc;
 void schedclock(struct proc *);
Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.115
diff -u -p -r1.115 kern_clock.c
--- kern/kern_clock.c   23 Aug 2023 01:55:45 -  1.115
+++ kern/kern_clock.c   8 Sep 2023 18:28:32 -
@@ -79,7 +79,6 @@
  */
 
 intstathz;
-intschedhz;
 intprofhz;
 intprofprocs;
 intticks = INT_MAX - (15 * 60 * HZ);
@@ -295,13 +294,10 @@ statclock(struct clockframe *frame)
if (p != NULL) {
p->p_cpticks++;
/*
-* If no schedclock is provided, call it here at ~~12-25 Hz;
-* ~~16 Hz is best
+* schedclock() runs every eighth statclock().
 */
-   if (schedhz == 0) {
-   if ((++spc->spc_schedticks & 3) == 0)
-   schedclock(p);
-   }
+   if ((++spc->spc_schedticks & 3) == 0)
+   schedclock(p);
}
 }
 



Re: Use counters_read(9) from ddb(4)

2023-09-06 Thread Scott Cheloha
On Wed, Sep 06, 2023 at 01:04:19PM +0100, Martin Pieuchot wrote:
> Debugging OOM is hard.  UVM uses per-CPU counters and sadly
> counters_read(9) needs to allocate memory.  This is not acceptable in
> ddb(4).  As a result I cannot see the content of UVM counters in OOM
> situations.
> 
> Diff below introduces a *_static() variant of counters_read(9) that
> takes a secondary buffer to avoid calling malloc(9).  Is it fine?  Do
> you have a better idea?  Should we make it the default or using the
> stack might be a problem?

Instead of adding a second interface I think we could get away with
just extending counters_read(9) to take a scratch buffer as an optional
fourth parameter:

void
counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
uint64_t *scratch);

"scratch"?  "temp"?  "tmp"?

Anyway, a NULL scratch means "allocate this for me", otherwise you're
saying you've brought your own.  Obviously the contents of scratch are
undefined upon return.

This kinda looks like a case where we could annotate these pointers
with 'restrict', but I have never fully understood when 'restrict' is
appropriate vs. when it is overkill or useless.

Index: ./kern/subr_percpu.c
===
RCS file: /cvs/src/sys/kern/subr_percpu.c,v
retrieving revision 1.10
diff -u -p -r1.10 subr_percpu.c
--- ./kern/subr_percpu.c3 Oct 2022 14:10:53 -   1.10
+++ ./kern/subr_percpu.c6 Sep 2023 17:18:46 -
@@ -159,17 +159,19 @@ counters_free(struct cpumem *cm, unsigne
 }
 
 void
-counters_read(struct cpumem *cm, uint64_t *output, unsigned int n)
+counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
+uint64_t *scratch)
 {
struct cpumem_iter cmi;
-   uint64_t *gen, *counters, *temp;
+   uint64_t *gen, *counters, *temp = scratch;
uint64_t enter, leave;
unsigned int i;
 
for (i = 0; i < n; i++)
output[i] = 0;
 
-   temp = mallocarray(n, sizeof(uint64_t), M_TEMP, M_WAITOK);
+   if (scratch == NULL)
+   temp = mallocarray(n, sizeof(uint64_t), M_TEMP, M_WAITOK);
 
gen = cpumem_first(&cmi, cm);
do {
@@ -202,7 +204,8 @@ counters_read(struct cpumem *cm, uint64_
gen = cpumem_next(&cmi, cm);
} while (gen != NULL);
 
-   free(temp, M_TEMP, n * sizeof(uint64_t));
+   if (scratch == NULL)
+   free(temp, M_TEMP, n * sizeof(uint64_t));
 }
 
 void
@@ -305,7 +308,8 @@ counters_free(struct cpumem *cm, unsigne
 }
 
 void
-counters_read(struct cpumem *cm, uint64_t *output, unsigned int n)
+counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
+uint64_t *scratch)
 {
uint64_t *counters;
unsigned int i;
Index: ./kern/kern_sysctl.c
===
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.418
diff -u -p -r1.418 kern_sysctl.c
--- ./kern/kern_sysctl.c16 Jul 2023 03:01:31 -  1.418
+++ ./kern/kern_sysctl.c6 Sep 2023 17:18:47 -
@@ -519,7 +519,7 @@ kern_sysctl(int *name, u_int namelen, vo
unsigned int i;
 
memset(&mbs, 0, sizeof(mbs));
-   counters_read(mbstat, counters, MBSTAT_COUNT);
+   counters_read(mbstat, counters, nitems(counters), NULL);
for (i = 0; i < MBSTAT_TYPES; i++)
mbs.m_mtypes[i] = counters[i];
 
Index: ./kern/subr_evcount.c
===
RCS file: /cvs/src/sys/kern/subr_evcount.c,v
retrieving revision 1.15
diff -u -p -r1.15 subr_evcount.c
--- ./kern/subr_evcount.c   5 Dec 2022 08:58:49 -   1.15
+++ ./kern/subr_evcount.c   6 Sep 2023 17:18:47 -
@@ -101,7 +101,7 @@ evcount_sysctl(int *name, u_int namelen,
 {
int error = 0, s, nintr, i;
struct evcount *ec;
-   u_int64_t count;
+   u_int64_t count, scratch;
 
if (newp != NULL)
return (EPERM);
@@ -129,7 +129,7 @@ evcount_sysctl(int *name, u_int namelen,
if (ec == NULL)
return (ENOENT);
if (ec->ec_percpu != NULL) {
-   counters_read(ec->ec_percpu, &count, 1);
+   counters_read(ec->ec_percpu, &count, 1, &scratch);
} else {
s = splhigh();
count = ec->ec_count;
Index: ./net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.707
diff -u -p -r1.707 if.c
--- ./net/if.c  18 Aug 2023 08:10:16 -  1.707
+++ ./net/if.c  6 Sep 2023 17:18:49 -
@@ -2797,7 +2797,8 @@ if_getdata(struct ifnet *ifp, struct if_
if (ifp->if_counters != NULL) {
uint64_t counters[ifc_ncounters];
 
-   counters_read(ifp->if_counters, counters, nitems(counters));
+   counters_read(ifp->if_counters, 

clockintr: add clockintr_advance_random()

2023-09-05 Thread Scott Cheloha
mpi@ suggests folding the pseudorandom advance code from
clockintr_statclock() into the clockintr API itself.  This replaces
three API calls -- clockintr_expiration(), clockintr_nsecuptime(), and
clockintr_schedule() -- we just one call to a new function,
clockintr_advance_random().

I'm fine with it.  A pseudorandom period is an odd thing and
supporting it is difficult.  Having a single bespoke API to support it
might be the lesser of two evils.

With this in place, the statclock() patch on tech@ can be simplified.

ok?

Index: kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.33
diff -u -p -r1.33 kern_clockintr.c
--- kern_clockintr.c26 Aug 2023 22:21:00 -  1.33
+++ kern_clockintr.c5 Sep 2023 14:11:38 -
@@ -42,8 +42,8 @@ uint32_t statclock_avg;   /* [I] average
 uint32_t statclock_min;/* [I] minimum statclock period 
(ns) */
 uint32_t statclock_mask;   /* [I] set of allowed offsets */
 
+uint64_t clockintr_advance_random(struct clockintr *, uint64_t, uint32_t);
 void clockintr_cancel_locked(struct clockintr *);
-uint64_t clockintr_expiration(const struct clockintr *);
 void clockintr_hardclock(struct clockintr *, void *);
 uint64_t clockintr_nsecuptime(const struct clockintr *);
 void clockintr_schedule(struct clockintr *, uint64_t);
@@ -345,6 +345,30 @@ clockintr_advance(struct clockintr *cl, 
return count;
 }
 
+/*
+ * Custom version of clockintr_advance() to support a pseudorandom
+ * statclock() period.  Hopefully we can throw this out at some point
+ * in the future.
+ */
+uint64_t
+clockintr_advance_random(struct clockintr *cl, uint64_t lo, uint32_t mask)
+{
+   uint64_t count = 0;
+   struct clockintr_queue *cq = cl->cl_queue;
+   uint32_t off;
+
+   KASSERT(cl == &cq->cq_shadow);
+
+   while (cl->cl_expiration <= cq->cq_uptime) {
+   while ((off = (random() & mask)) == 0)
+   continue;
+   cl->cl_expiration += lo + off;
+   count++;
+   }
+   SET(cl->cl_flags, CLST_SHADOW_PENDING);
+   return count;
+}
+
 void
 clockintr_cancel(struct clockintr *cl)
 {
@@ -402,21 +426,6 @@ clockintr_establish(struct clockintr_que
return cl;
 }
 
-uint64_t
-clockintr_expiration(const struct clockintr *cl)
-{
-   uint64_t expiration;
-   struct clockintr_queue *cq = cl->cl_queue;
-
-   if (cl == &cq->cq_shadow)
-   return cl->cl_expiration;
-
-   mtx_enter(&cq->cq_mtx);
-   expiration = cl->cl_expiration;
-   mtx_leave(&cq->cq_mtx);
-   return expiration;
-}
-
 void
 clockintr_schedule(struct clockintr *cl, uint64_t expiration)
 {
@@ -478,13 +487,6 @@ clockintr_stagger(struct clockintr *cl, 
mtx_leave(&cq->cq_mtx);
 }
 
-uint64_t
-clockintr_nsecuptime(const struct clockintr *cl)
-{
-   KASSERT(cl == &cl->cl_queue->cq_shadow);
-   return cl->cl_queue->cq_uptime;
-}
-
 void
 clockintr_hardclock(struct clockintr *cl, void *frame)
 {
@@ -498,20 +500,11 @@ clockintr_hardclock(struct clockintr *cl
 void
 clockintr_statclock(struct clockintr *cl, void *frame)
 {
-   uint64_t count, expiration, i, uptime;
-   uint32_t off;
+   uint64_t count, i;
 
if (ISSET(clockintr_flags, CL_RNDSTAT)) {
-   count = 0;
-   expiration = clockintr_expiration(cl);
-   uptime = clockintr_nsecuptime(cl);
-   while (expiration <= uptime) {
-   while ((off = (random() & statclock_mask)) == 0)
-   continue;
-   expiration += statclock_min + off;
-   count++;
-   }
-   clockintr_schedule(cl, expiration);
+   count = clockintr_advance_random(cl, statclock_min,
+   statclock_mask);
} else {
count = clockintr_advance(cl, statclock_avg);
}



Re: clockintr: move control of statclock() into scheduler

2023-09-04 Thread Scott Cheloha
On Thu, Aug 31, 2023 at 04:01:35PM -0500, Scott Cheloha wrote:
> This is the next patch in the clock interrupt policy reorganization
> series.
> 
> While the hardclock/dt(4) patch is being rewritten we can do this
> orthogonal statclock() patch.  It needs to get done at some point
> anyway, may as well do it now.
> 
> So, this patch moves most of the statclock() code out of the clockintr
> layer and cedes control of the statclock() to the scheduler.  My thinking
> is: (a) statclock() increments p_cpticks and calls schedclock(), so in
> practice it is a scheduler interrupt, and (b) in the future it would be
> nice if the scheduler could disable the statclock when a CPU is very idle
> and maybe save some power.
> 
> All of this should feel familiar.  It is equivalent to what we just
> did to roundrobin().
> 
> - Move the contents of the clockintr_statclock() wrapper function
>   into statclock() and make statclock() a real clockintr callback.
> 
> - clockintr_expiration(), clockintr_nsecuptime(), and
>   clockintr_schedule() all become public sys/clockintr.h
>   interfaces for use in statclock().
> 
> - Tweak statclock() to handle multiple expirations at once.
> 
> - Move the statclock handle from clockintr_queue (cq_statclock) to
>   schedstate_percpu (spc_statclock).  Establish spc_statclock during
>   sched_init_cpu().
> 
> - Move the statclock variables from kern_clockintr.c to kern_clock.c.
>   Move statclock variable initialization from clockintr_init() forward
>   into initclocks().
> 
> - Replace the CL_RNDSTAT flag with a new global boolean,
>   statclock_is_randomized.  Update clockintr_init() callers to set
>   statclock_is_randomized instead of passing CL_RNDSTAT. 

Ping.

ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.115
diff -u -p -r1.115 kern_clock.c
--- kern/kern_clock.c   23 Aug 2023 01:55:45 -  1.115
+++ kern/kern_clock.c   5 Sep 2023 01:13:22 -
@@ -39,6 +39,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -87,17 +88,42 @@ int ticks = INT_MAX - (15 * 60 * HZ);
 /* Don't force early wrap around, triggers bug in inteldrm */
 volatile unsigned long jiffies;
 
+uint32_t statclock_avg;/* [I] average statclock period (ns) */
+uint32_t statclock_min;/* [I] minimum statclock period (ns) */
+uint32_t statclock_mask;   /* [I] set of statclock_min offsets */
+int statclock_is_randomized;   /* [I] fixed or pseudo-random period */
+
 /*
  * Initialize clock frequencies and start both clocks running.
  */
 void
 initclocks(void)
 {
+   uint32_t half_avg, var;
+
/*
 * Let the machine-specific code do its bit.
 */
cpu_initclocks();
 
+   KASSERT(stathz >= 1 && stathz <= 1000000000);
+
+   /*
+* Compute the average statclock() period.  Then find var, the
+* largest power of two such that var <= statclock_avg / 2.
+*/
+   statclock_avg = 1000000000 / stathz;
+   half_avg = statclock_avg / 2;
+   for (var = 1U << 31; var > half_avg; var /= 2)
+   continue;
+
+   /*
+* Set a lower bound for the range using statclock_avg and var.
+* The mask for that range is just (var - 1).
+*/
+   statclock_min = statclock_avg - (var / 2);
+   statclock_mask = var - 1;
+
KASSERT(profhz >= stathz && profhz <= 1000000000);
KASSERT(profhz % stathz == 0);
profclock_period = 1000000000 / profhz;
@@ -246,12 +272,30 @@ stopprofclock(struct process *pr)
  * do process and kernel statistics.
  */
 void
-statclock(struct clockframe *frame)
+statclock(struct clockintr *cl, void *cf)
 {
+   uint64_t count, expiration, i, uptime;
+   struct clockframe *frame = cf;
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p = curproc;
struct process *pr;
+   uint32_t off;
+
+   if (statclock_is_randomized) {
+   count = 0;
+   expiration = clockintr_expiration(cl);
+   uptime = clockintr_nsecuptime(cl);
+   while (expiration <= uptime) {
+   while ((off = (random() & statclock_mask)) == 0)
+   continue;
+   expiration += statclock_min + off;
+   count++;
+   }
+   clockintr_schedule(cl, expiration);
+   } else {
+   count = clockintr_advance(cl, statclock_avg);
+   }
 
if (CLKF_USERMODE(frame)) {
pr = p->p_p;
@@ -259,11 +303,11 @@ statclock(struct clockframe *frame)
 * Came from user mode; CPU was in user state.
 * If this

clockintr: move control of statclock() into scheduler

2023-08-31 Thread Scott Cheloha
This is the next patch in the clock interrupt policy reorganization
series.

While the hardclock/dt(4) patch is being rewritten we can do this
orthogonal statclock() patch.  It needs to get done at some point
anyway, may as well do it now.

So, this patch moves most of the statclock() code out of the clockintr
layer and cedes control of the statclock() to the scheduler.  My thinking
is: (a) statclock() increments p_cpticks and calls schedclock(), so in
practice it is a scheduler interrupt, and (b) in the future it would be
nice if the scheduler could disable the statclock when a CPU is very idle
and maybe save some power.

All of this should feel familiar.  It is equivalent to what we just
did to roundrobin().

- Move the contents of the clockintr_statclock() wrapper function
  into statclock() and make statclock() a real clockintr callback.

- clockintr_expiration(), clockintr_nsecuptime(), and
  clockintr_schedule() all become public sys/clockintr.h
  interfaces for use in statclock().

- Tweak statclock() to handle multiple expirations at once.

- Move the statclock handle from clockintr_queue (cq_statclock) to
  schedstate_percpu (spc_statclock).  Establish spc_statclock during
  sched_init_cpu().

- Move the statclock variables from kern_clockintr.c to kern_clock.c.
  Move statclock variable initialization from clockintr_init() forward
  into initclocks().

- Replace the CL_RNDSTAT flag with a new global boolean,
  statclock_is_randomized.  Update clockintr_init() callers to set
  statclock_is_randomized instead of passing CL_RNDSTAT. 

ok?
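
To make the pseudorandom-period bounds concrete, the initclocks() arithmetic
in the diff below works out like this for stathz = 100, the common default
(the numbers are only an illustration of the formula):

    statclock_avg  = 1000000000 / 100;      /* 10000000 ns, i.e. 10 ms */
    /* var: largest power of two <= statclock_avg / 2 (5000000) */
    var            = 4194304;               /* 2^22 */
    statclock_min  = 10000000 - 4194304 / 2;/* 7902848 ns */
    statclock_mask = 4194304 - 1;           /* 0x3fffff */
    /*
     * Each period is statclock_min plus a nonzero (random() & statclock_mask)
     * offset, i.e. roughly 7.9 ms to 12.1 ms, averaging out to statclock_avg.
     */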

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.115
diff -u -p -r1.115 kern_clock.c
--- kern/kern_clock.c   23 Aug 2023 01:55:45 -  1.115
+++ kern/kern_clock.c   31 Aug 2023 19:49:27 -
@@ -39,6 +39,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -87,17 +88,42 @@ int ticks = INT_MAX - (15 * 60 * HZ);
 /* Don't force early wrap around, triggers bug in inteldrm */
 volatile unsigned long jiffies;
 
+uint32_t statclock_avg;/* [I] average statclock period (ns) */
+uint32_t statclock_min;/* [I] minimum statclock period (ns) */
+uint32_t statclock_mask;   /* [I] set of statclock_min offsets */
+int statclock_is_randomized;   /* [I] fixed or pseudo-random period */
+
 /*
  * Initialize clock frequencies and start both clocks running.
  */
 void
 initclocks(void)
 {
+   uint32_t half_avg, var;
+
/*
 * Let the machine-specific code do its bit.
 */
cpu_initclocks();
 
+   KASSERT(stathz >= 1 && stathz <= 1000000000);
+
+   /*
+* Compute the average statclock() period.  Then find var, the
+* largest power of two such that var <= statclock_avg / 2.
+*/
+   statclock_avg = 1000000000 / stathz;
+   half_avg = statclock_avg / 2;
+   for (var = 1U << 31; var > half_avg; var /= 2)
+   continue;
+
+   /*
+* Set a lower bound for the range using statclock_avg and var.
+* The mask for that range is just (var - 1).
+*/
+   statclock_min = statclock_avg - (var / 2);
+   statclock_mask = var - 1;
+
	KASSERT(profhz >= stathz && profhz <= 1000000000);
KASSERT(profhz % stathz == 0);
	profclock_period = 1000000000 / profhz;
@@ -246,12 +272,30 @@ stopprofclock(struct process *pr)
  * do process and kernel statistics.
  */
 void
-statclock(struct clockframe *frame)
+statclock(struct clockintr *cl, void *cf)
 {
+   uint64_t count, expiration, i, uptime;
+   struct clockframe *frame = cf;
struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p = curproc;
struct process *pr;
+   uint32_t off;
+
+   if (statclock_is_randomized) {
+   count = 0;
+   expiration = clockintr_expiration(cl);
+   uptime = clockintr_nsecuptime(cl);
+   while (expiration <= uptime) {
+   while ((off = (random() & statclock_mask)) == 0)
+   continue;
+   expiration += statclock_min + off;
+   count++;
+   }
+   clockintr_schedule(cl, expiration);
+   } else {
+   count = clockintr_advance(cl, statclock_avg);
+   }
 
if (CLKF_USERMODE(frame)) {
pr = p->p_p;
@@ -259,11 +303,11 @@ statclock(struct clockframe *frame)
 * Came from user mode; CPU was in user state.
 * If this process is being profiled record the tick.
 */
-   p->p_uticks++;
+   p->p_uticks += count;
if (pr->ps_nice > NZERO)
-   spc->spc_cp_time[CP_NICE]++;
+   spc->spc_cp_time[CP_NICE] += count;
else
-

sched_cpu_init: no need to NULL-check clockintr pointers

2023-08-31 Thread Scott Cheloha
We don't actually need these NULL-checks, my mistake.

sched_init_cpu() is only ever run once for a given cpu_info.

ok?

Index: kern_sched.c
===
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.87
diff -u -p -r1.87 kern_sched.c
--- kern_sched.c29 Aug 2023 16:19:34 -  1.87
+++ kern_sched.c31 Aug 2023 15:24:02 -
@@ -88,26 +88,15 @@ sched_init_cpu(struct cpu_info *ci)
 
spc->spc_idleproc = NULL;
 
-   if (spc->spc_itimer == NULL) {
-   spc->spc_itimer = clockintr_establish(&ci->ci_queue,
-   itimer_update);
-   if (spc->spc_itimer == NULL) {
-   panic("%s: clockintr_establish itimer_update",
-   __func__);
-   }
-   }
-   if (spc->spc_profclock == NULL) {
-   spc->spc_profclock = clockintr_establish(&ci->ci_queue,
-   profclock);
-   if (spc->spc_profclock == NULL)
-   panic("%s: clockintr_establish profclock", __func__);
-   }
-   if (spc->spc_roundrobin == NULL) {
-   spc->spc_roundrobin = clockintr_establish(&ci->ci_queue,
-   roundrobin);
-   if (spc->spc_roundrobin == NULL)
-   panic("%s: clockintr_establish roundrobin", __func__);
-   }
+   spc->spc_itimer = clockintr_establish(&ci->ci_queue, itimer_update);
+   if (spc->spc_itimer == NULL)
+   panic("%s: clockintr_establish itimer_update", __func__);
+   spc->spc_profclock = clockintr_establish(&ci->ci_queue, profclock);
+   if (spc->spc_profclock == NULL)
+   panic("%s: clockintr_establish profclock", __func__);
+   spc->spc_roundrobin = clockintr_establish(&ci->ci_queue, roundrobin);
+   if (spc->spc_roundrobin == NULL)
+   panic("%s: clockintr_establish roundrobin", __func__);
 
kthread_create_deferred(sched_kthreads_create, ci);
 



Re: dt(4), hardclock(9): move interval, profile providers to dedicated callback

2023-08-25 Thread Scott Cheloha
On Thu, Aug 24, 2023 at 07:21:29PM +0200, Martin Pieuchot wrote:
> On 23/08/23(Wed) 18:52, Scott Cheloha wrote:
> > This is the next patch in the clock interrupt reorganization series.
> 
> Thanks for your diff.  I'm sorry but it is really hard for me to help
> review this diff because there is still no man page for this API+subsystem.
> 
> Can we start with that please?

Sure, a first draft of a clockintr_establish.9 manpage is included
below.

We also have a manpage in the tree, clockintr.9.  It is a bit out of
date, but it covers the broad strokes of how the driver-facing portion
of the subsystem works.

> > This patch moves the entry points for the interval and profile dt(4)
> > providers from the hardclock(9) to a dedicated clock interrupt
> > callback, dt_prov_profile_intr(), in dev/dt/dt_prov_profile.c.
> > 
> > - To preserve current behavior, (1) both provider entrypoints have
> >   been moved into a single callback function, (2) the interrupt runs at
> >   the same frequency as the hardclock, and (3) the interrupt is
> >   staggered to co-occur with the hardclock on a given CPU.
> 
> The only behavior that needs to be preserved is the output of dumping
> stacks.  That means DT_FA_PROFILE and DT_FA_STATIC certainly needs to
> be adapted with this change.  You can figure that out by looking at the
> output of /usr/src/share/btrace/kprofile.bt without and with this diff.
> 
> Please generate a FlameGraph to make sure they're still the same.

dt_prov_profile_intr() runs at the same stack depth as hardclock(), so
indeed they are still the same.

> Apart from that I'd prefer if we could skip the mechanical change and
> go straight to what dt(4) needs.  Otherwise we will have to re-design
> everything.

I think a mechanical "move the code from point A to point B" patch is
useful.  It makes the changes easier to follow when tracing the
revision history in the future.

If you insist on skipping it, though, I think I can live without it.

> If you don't want to do this work, then leave it and tell
> me what you need and what is your plan so I can help you and do it
> myself.

I am very keen to do this, or at least to help with it.

> dt(4) needs a way to schedule two different kind of periodic timeouts
> with the higher precision possible.  It is currently plugged to hardclock
> because there is nothing better.

Yes.

> The current code assumes the periodic entry points are external to dt(4).
> This diff moves them in the middle of dt(4) but keeps the existing flow
> which makes the code very convoluted.
> 
> A starting point to understand the added complexity it so see that the
> DT_ENTER() macro are no longer needed if we move the entry points inside
> dt(4).

I did see that.  It seems really easy to remove the macros in a
subsequent patch, though.

Again, if you want to do it all in one patch that's OK.

> The first periodic timeout is dt_prov_interval_enter().  It could be
> implemented as a per-PCB timeout_add_nsec(9).  The drawback of this
> approach is that it uses too much code in the kernel which is a problem
> when instrumenting the kernel itself.  Every subsystem used by dt(4) is
> impossible to instrument with btrace(8).

I think you can avoid this instrumentation problem by using clockintr,
where the callback functions are run from the hardware interrupt
context, just like hardclock().

> The second periodic timeout it dt_prov_profile_enter().  It is similar
> to the previous one and has to run on every CPU.
> 
> Both are currently bound to tick, but we want per-PCB time resolution.
> We can get rid of `dp_nticks' and `dp_maxtick' if we control when the
> timeouts fires.

My current thought is that each PCB could have its own "dp_period" or
"dp_interval", a count of nanoseconds.

> > - Each dt_pcb gets a provider-specific "dp_clockintr" pointer.  If the
> >   PCB's implementing provider needs a clock interrupt to do its work, it
> >   stores the handle in dp_clockintr.  The handle, if any, is established
> >   during dtpv_alloc() and disestablished during dtpv_dealloc().
> 
> Sorry, but as I said I don't understand what is a clockintr.  How does it
> fit in the kernel and how is it supposed to be used?

The goal of clockintr is to provide a machine-independent API for
scheduling clock interrupts.  You can use it to implement something
like hardclock() or statclock().  We are already using it to implement
these functions, among others.
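
To make that more concrete, here is a sketch of the usage pattern that
the scheduler handles in this series follow.  The callback, period, and
handle names below are illustrative only, not real symbols:

	/* The callback runs in hardware interrupt context and rearms
	 * itself periodically, without drift. */
	void
	example_callback(struct clockintr *cl, void *frame)
	{
		uint64_t count;

		/* count = number of periods elapsed since the last run */
		count = clockintr_advance(cl, example_period);

		/* ... do the periodic work, scaled by count if needed ... */
	}

	/* Established once per CPU, e.g. from sched_init_cpu(): */
	spc->spc_example = clockintr_establish(&ci->ci_queue, example_callback);
	if (spc->spc_example == NULL)
		panic("clockintr_establish");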

> Why have it per PCB and not per provider or for the whole driver instead?
> Per-PCB implies that if I run 3 different profiling on a 32 CPU machines
> I now have 96 different clockintr.  Is it what we want?

Yes, I think that sounds fine.  If we run into scaling problems we can
always just change the un

dt(4), hardclock(9): move interval, profile providers to dedicated callback

2023-08-23 Thread Scott Cheloha
This is the next patch in the clock interrupt reorganization series.

This patch moves the entry points for the interval and profile dt(4)
providers from the hardclock(9) to a dedicated clock interrupt
callback, dt_prov_profile_intr(), in dev/dt/dt_prov_profile.c.

- To preserve current behavior, (1) both provider entrypoints have
  been moved into a single callback function, (2) the interrupt runs at
  the same frequency as the hardclock, and (3) the interrupt is
  staggered to co-occur with the hardclock on a given CPU.

  All of this can be changed later.  This patch is strictly
  logistical: moving these parts out of the hardclock.

- Each dt_pcb gets a provider-specific "dp_clockintr" pointer.  If the
  PCB's implementing provider needs a clock interrupt to do its work, it
  stores the handle in dp_clockintr.  The handle, if any, is established
  during dtpv_alloc() and disestablished during dtpv_dealloc().

  Only the interval and profile providers use it at present.

- In order to implement dt_prov_profile_dealloc() I needed a complete
  definition of struct dt_softc, so I hoisted it up out of dt_dev.c
  into dtvar.h.

- A PCB's periodic clock interrupt, if any, is started during
  dt_ioctl_record_start() stopped during dt_ioctl_record_stop().
  This is not a provider-agnostic approach, but I don't see where
  else to start/stop the clock interrupt.

  One alternative is to start running the clock interrupts when they
  are allocated in dtpv_alloc() and stop them when they are freed in
  dtpv_dealloc().  This is wasteful, though: the PCBs are not recording
  yet, so the interrupts won't perform any useful work until the
  controlling process enables recording.

  An additional pair of provider hooks, e.g. "dtpv_record_start" and
  "dtpv_record_stop", might resolve this.

- We haven't needed to destroy clock interrupts yet, so the
  clockintr_disestablish() function used in this patch is new.

  The implementation is extremely similar to that of clockintr_cancel().
  However, for sake of simplicity, a callback may not disestablish its
  clockintr while the callback is running.

  One tricky part: if a clockintr is disestablished while it is running,
  the dispatch thread takes care not to use-after-free when it re-enters
  the clockintr_queue mutex.

I have tested this on amd64.  It seems to work: the clock interrupts
fire, stack traces are recorded, and the handles are deallocated when
the process closes the file descriptor.

I will be testing it on other platforms over the next few days.

Thoughts?  Test results?

Index: kern/kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.32
diff -u -p -r1.32 kern_clockintr.c
--- kern/kern_clockintr.c   21 Aug 2023 17:22:04 -  1.32
+++ kern/kern_clockintr.c   23 Aug 2023 23:35:11 -
@@ -218,7 +218,7 @@ clockintr_dispatch(void *frame)
 {
uint64_t lateness, run = 0, start;
struct cpu_info *ci = curcpu();
-   struct clockintr *cl;
+   struct clockintr *cl, *shadow;
	struct clockintr_queue *cq = &ci->ci_queue;
u_int ogen;
 
@@ -257,22 +257,26 @@ clockintr_dispatch(void *frame)
break;
}
clockintr_cancel_locked(cl);
-   cq->cq_shadow.cl_expiration = cl->cl_expiration;
+   shadow = &cq->cq_shadow;
+   shadow->cl_expiration = cl->cl_expiration;
+   shadow->cl_func = cl->cl_func;
cq->cq_running = cl;
	mtx_leave(&cq->cq_mtx);
 
-   cl->cl_func(&cq->cq_shadow, frame);
+   shadow->cl_func(shadow, frame);
 
	mtx_enter(&cq->cq_mtx);
-   cq->cq_running = NULL;
-   if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
-   CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
-   CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
-   }
-   if (ISSET(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING)) {
-   CLR(cq->cq_shadow.cl_flags, CLST_SHADOW_PENDING);
-   clockintr_schedule_locked(cl,
-   cq->cq_shadow.cl_expiration);
+   if (cq->cq_running != NULL) {
+   cq->cq_running = NULL;
+   if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
+   CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
+   CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
+   }
+   if (ISSET(shadow->cl_flags, CLST_SHADOW_PENDING)) {
+   CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
+   clockintr_schedule_locked(cl,
+   shadow->cl_expiration);
+   }
}
run++;
}
@@ -382,6 +386,34 @@ 

Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-21 Thread Scott Cheloha
On Tue, Aug 22, 2023 at 11:28:25AM +0800, Kevin Lo wrote:
> On Mon, Aug 21, 2023 at 09:53:53PM -0500, Scott Cheloha wrote:
> > On Tue, Aug 22, 2023 at 02:36:31AM +, Mike Larkin wrote:
> > > On Mon, Aug 21, 2023 at 09:26:00PM -0500, Scott Cheloha wrote:
> > > > On Mon, Aug 21, 2023 at 10:10:58PM +, Mike Larkin wrote:
> > > > > > - alpha
> > > > > > - hppa
> > > > > > - m88k/luna88k
> > > > >
> > > > > if you are really interested in doing this [...]
> > > >
> > > > "really interested" is a bit strong.  As always, my primary goal is
> > > > not to break anything when I make a commit.
> > > >
> > > > The luna88k patch looks pretty straightforward, but it's hard to be
> > > > completely sure I didn't screw something up.
> > > >
> > > > > [...] you could run this in nono since you're just looking for
> > > > > a compile/boot test.
> > > >
> > > > Apparently the license forbids redistribution.  Super annoying.
> > > >
> > > 
> > > so? install it, boot a luna88k "vm", test your diff, then you have your
> > > question answered. you aren't redistributing anything.
> > 
> > No, I mean that there is no binary for pkg_add, so I have to build it
> > by hand.  Unless I'm missing something?
> 
> Hi Scott,
> 
> You could install emulators/nono, which is luna88k emulator.

???

I just tried that, it says there is no such thing.

$ doas pkg_add nono
quirks-6.138 signed on 2023-08-20T20:51:44Z
Can't find nono
$ doas pkg_add emulators/nono
quirks-6.138 signed on 2023-08-20T20:51:44Z
file:emulators/: empty
Can't find nono

What am I missing here?  I'm really sorry, I'm stumped.



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-21 Thread Scott Cheloha
On Tue, Aug 22, 2023 at 02:36:31AM +, Mike Larkin wrote:
> On Mon, Aug 21, 2023 at 09:26:00PM -0500, Scott Cheloha wrote:
> > On Mon, Aug 21, 2023 at 10:10:58PM +, Mike Larkin wrote:
> > > On Sat, Aug 19, 2023 at 01:44:47PM -0500, Scott Cheloha wrote:
> > > > On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> > > > > This is the next patch in the clock interrupt reorganization series.
> > > > >
> > > > > Before we continue breaking up the hardclock(9) we need to detour into
> > > > > the MD code.
> > > > >
> > > > > This patch divides the "initialization" parts of cpu_initclocks() from
> > > > > the "start the clock interrupt" parts.  Seprating the two parts leaves
> > > > > initclocks() an opportunity to prepare the primary CPU for clock
> > > > > interrupt dispatch in a machine-independent manner before actually
> > > > > pulling the trigger.  It's nearly impossible to do any MI setup during
> > > > > initclocks() because cpu_initclocks() does everything in one go: both
> > > > > initialization and kickoff are done when cpu_initclocks() returns.
> > > > >
> > > > > Many platforms have a "cpu_startclock()" function, so this patch takes
> > > > > that de facto standard and makes it a rule: cpu_startclock() is now
> > > > > required.  It is prototyped in sys/systm.h and every platform must
> > > > > implement it.
> > > > >
> > > > > The revised initclocks() sequence is then:
> > > > >
> > > > > 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
> > > > >hz, stathz, and profhz are initialized.  All the machine
> > > > >independent setup in step (2) (currently) depends upon
> > > > >these machine-dependent values.
> > > > >
> > > > > 2. Compute intervals using hz, stathz, and profhz.
> > > > >
> > > > >In a later step I will move the full contents of clockintr_init()
> > > > >up into initclocks() and get rid of clockintr_init() entirely.
> > > > >
> > > > > 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
> > > > >clock interrupt dispatch cycle on the primary CPU.
> > > > >
> > > > > I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> > > > > (lapic path), macppc, octeon, and sparc64 (sun4v).
> > > > >
> > > > > I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> > > > > luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> > > > > here.  Everything else is relatively straightforward, though I may
> > > > > have missed a few stray variables here or there.
> > > > >
> > > > > Test results?  Ok?
> > > >
> > > > Here is an updated patch that removes several MD prototypes for
> > > > cpu_startclock() that I missed the first time through.
> > > >
> > > > I went back and tested these again:
> > > >
> > > > - amd64 (lapic)
> > > > - arm64
> > > > - i386 (lapic)
> > > > - powerpc/macppc
> > > > - mips64/octeon (loongson should be fine)
> > > > - sparc64 (sys_tick; tick/stick should be fine)
> > > >
> > > > arm/armv7 and riscv64 were tested under the previous version, but I
> > > > would appreciate a second compile-test to make sure the header changes
> > > > in the updated patch did not break the build (CC phessler@, jsg@).
> > > >
> > > > I am still seeking compile/boot-tests for the following:
> > > >
> > > > - alpha
> > > > - hppa
> > > > - m88k/luna88k
> > >
> > > if you are really interested in doing this [...]
> >
> > "really interested" is a bit strong.  As always, my primary goal is
> > not to break anything when I make a commit.
> >
> > The luna88k patch looks pretty straightforward, but it's hard to be
> > completely sure I didn't screw something up.
> >
> > > [...] you could run this in nono since you're just looking for
> > > a compile/boot test.
> >
> > Apparently the license forbids redistribution.  Super annoying.
> 
> so? install it, boot a luna88k "vm", test your diff, then you have your
> question answered. you aren't redistributing anything.

FWIW, I think vmctl/vmd have a nicer user interface.

I feel like I'm... boxing... with nono, not using it.



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-21 Thread Scott Cheloha
On Tue, Aug 22, 2023 at 02:36:31AM +, Mike Larkin wrote:
> On Mon, Aug 21, 2023 at 09:26:00PM -0500, Scott Cheloha wrote:
> > On Mon, Aug 21, 2023 at 10:10:58PM +, Mike Larkin wrote:
> > > On Sat, Aug 19, 2023 at 01:44:47PM -0500, Scott Cheloha wrote:
> > > > On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> > > > > This is the next patch in the clock interrupt reorganization series.
> > > > >
> > > > > Before we continue breaking up the hardclock(9) we need to detour into
> > > > > the MD code.
> > > > >
> > > > > This patch divides the "initialization" parts of cpu_initclocks() from
> > > > > the "start the clock interrupt" parts.  Seprating the two parts leaves
> > > > > initclocks() an opportunity to prepare the primary CPU for clock
> > > > > interrupt dispatch in a machine-independent manner before actually
> > > > > pulling the trigger.  It's nearly impossible to do any MI setup during
> > > > > initclocks() because cpu_initclocks() does everything in one go: both
> > > > > initialization and kickoff are done when cpu_initclocks() returns.
> > > > >
> > > > > Many platforms have a "cpu_startclock()" function, so this patch takes
> > > > > that de facto standard and makes it a rule: cpu_startclock() is now
> > > > > required.  It is prototyped in sys/systm.h and every platform must
> > > > > implement it.
> > > > >
> > > > > The revised initclocks() sequence is then:
> > > > >
> > > > > 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
> > > > >hz, stathz, and profhz are initialized.  All the machine
> > > > >independent setup in step (2) (currently) depends upon
> > > > >these machine-dependent values.
> > > > >
> > > > > 2. Compute intervals using hz, stathz, and profhz.
> > > > >
> > > > >In a later step I will move the full contents of clockintr_init()
> > > > >up into initclocks() and get rid of clockintr_init() entirely.
> > > > >
> > > > > 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
> > > > >clock interrupt dispatch cycle on the primary CPU.
> > > > >
> > > > > I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> > > > > (lapic path), macppc, octeon, and sparc64 (sun4v).
> > > > >
> > > > > I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> > > > > luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> > > > > here.  Everything else is relatively straightforward, though I may
> > > > > have missed a few stray variables here or there.
> > > > >
> > > > > Test results?  Ok?
> > > >
> > > > Here is an updated patch that removes several MD prototypes for
> > > > cpu_startclock() that I missed the first time through.
> > > >
> > > > I went back and tested these again:
> > > >
> > > > - amd64 (lapic)
> > > > - arm64
> > > > - i386 (lapic)
> > > > - powerpc/macppc
> > > > - mips64/octeon (loongson should be fine)
> > > > - sparc64 (sys_tick; tick/stick should be fine)
> > > >
> > > > arm/armv7 and riscv64 were tested under the previous version, but I
> > > > would appreciate a second compile-test to make sure the header changes
> > > > in the updated patch did not break the build (CC phessler@, jsg@).
> > > >
> > > > I am still seeking compile/boot-tests for the following:
> > > >
> > > > - alpha
> > > > - hppa
> > > > - m88k/luna88k
> > >
> > > if you are really interested in doing this [...]
> >
> > "really interested" is a bit strong.  As always, my primary goal is
> > not to break anything when I make a commit.
> >
> > The luna88k patch looks pretty straightforward, but it's hard to be
> > completely sure I didn't screw something up.
> >
> > > [...] you could run this in nono since you're just looking for
> > > a compile/boot test.
> >
> > Apparently the license forbids redistribution.  Super annoying.
> >
> 
> so? install it, boot a luna88k "vm", test your diff, then you have your
> question answered. you aren't redistributing anything.

No, I mean that there is no binary for pkg_add, so I have to build it
by hand.  Unless I'm missing something?



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-21 Thread Scott Cheloha
On Mon, Aug 21, 2023 at 10:10:58PM +, Mike Larkin wrote:
> On Sat, Aug 19, 2023 at 01:44:47PM -0500, Scott Cheloha wrote:
> > On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> > > This is the next patch in the clock interrupt reorganization series.
> > >
> > > Before we continue breaking up the hardclock(9) we need to detour into
> > > the MD code.
> > >
> > > This patch divides the "initialization" parts of cpu_initclocks() from
> > > the "start the clock interrupt" parts.  Seprating the two parts leaves
> > > initclocks() an opportunity to prepare the primary CPU for clock
> > > interrupt dispatch in a machine-independent manner before actually
> > > pulling the trigger.  It's nearly impossible to do any MI setup during
> > > initclocks() because cpu_initclocks() does everything in one go: both
> > > initialization and kickoff are done when cpu_initclocks() returns.
> > >
> > > Many platforms have a "cpu_startclock()" function, so this patch takes
> > > that de facto standard and makes it a rule: cpu_startclock() is now
> > > required.  It is prototyped in sys/systm.h and every platform must
> > > implement it.
> > >
> > > The revised initclocks() sequence is then:
> > >
> > > 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
> > >hz, stathz, and profhz are initialized.  All the machine
> > >independent setup in step (2) (currently) depends upon
> > >these machine-dependent values.
> > >
> > > 2. Compute intervals using hz, stathz, and profhz.
> > >
> > >In a later step I will move the full contents of clockintr_init()
> > >up into initclocks() and get rid of clockintr_init() entirely.
> > >
> > > 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
> > >clock interrupt dispatch cycle on the primary CPU.
> > >
> > > I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> > > (lapic path), macppc, octeon, and sparc64 (sun4v).
> > >
> > > I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> > > luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> > > here.  Everything else is relatively straightforward, though I may
> > > have missed a few stray variables here or there.
> > >
> > > Test results?  Ok?
> >
> > Here is an updated patch that removes several MD prototypes for
> > cpu_startclock() that I missed the first time through.
> >
> > I went back and tested these again:
> >
> > - amd64 (lapic)
> > - arm64
> > - i386 (lapic)
> > - powerpc/macppc
> > - mips64/octeon (loongson should be fine)
> > - sparc64 (sys_tick; tick/stick should be fine)
> >
> > arm/armv7 and riscv64 were tested under the previous version, but I
> > would appreciate a second compile-test to make sure the header changes
> > in the updated patch did not break the build (CC phessler@, jsg@).
> >
> > I am still seeking compile/boot-tests for the following:
> >
> > - alpha
> > - hppa
> > - m88k/luna88k
> 
> if you are really interested in doing this [...]

"really interested" is a bit strong.  As always, my primary goal is
not to break anything when I make a commit.

The luna88k patch looks pretty straightforward, but it's hard to be
completely sure I didn't screw something up.

> [...] you could run this in nono since you're just looking for
> a compile/boot test.

Apparently the license forbids redistribution.  Super annoying.

> > - powerpc64
> 
> builds and boots on powerpc64

Noted.  Thank you!



i386: i8254_initclocks: set IPL_MPSAFE for clock, rtc IRQs

2023-08-20 Thread Scott Cheloha
On amd64 we lie about the interrupts established during
i8254_initclocks().  We claim they are MP-safe in order to mollify a
KASSERT in intr_establish() and continue booting.

See amd64/isa/clock.c:
   279  void
   280  i8254_initclocks(void)
   281  {
   282  i8254_inittimecounter();	/* hook the interrupt-based i8254 tc */
   283
   284  stathz = 128;
   285  profhz = 1024;  /* XXX does not divide into 1 billion */
   286  clockintr_init(0);
   287
   288  clockintr_cpu_init(NULL);
   289
   290  /*
   291   * While the clock interrupt handler isn't really MPSAFE, the
   292   * i8254 can't really be used as a clock on a true MP system.
   293   */
   294  isa_intr_establish(NULL, 0, IST_PULSE, IPL_CLOCK | IPL_MPSAFE,
   295  clockintr, 0, "clock");
   296  isa_intr_establish(NULL, 8, IST_PULSE, IPL_STATCLOCK | IPL_MPSAFE,
   297  rtcintr, 0, "rtc");

and amd64/amd64/intr.c:

   332  void *
   333  intr_establish(int legacy_irq, struct pic *pic, int pin, int type, int level,
   334  struct cpu_info *ci, int (*handler)(void *), void *arg, const char *what)
   335  {
   336  struct intrhand **p, *q, *ih;
   337  int slot, error, idt_vec;
   338  struct intrsource *source;
   339  struct intrstub *stubp;
   340  int flags;
   341
   342  #ifdef DIAGNOSTIC
   343  if (legacy_irq != -1 && (legacy_irq < 0 || legacy_irq > 15))
   344  panic("intr_establish: bad legacy IRQ value");
   345
   346  if (legacy_irq == -1 && pic == &i8259_pic)
   347  panic("intr_establish: non-legacy IRQ on i8259");
   348  #endif
   349
   350  flags = level & IPL_MPSAFE;
   351  level &= ~IPL_MPSAFE;
   352
   353  KASSERT(level <= IPL_TTY || level >= IPL_CLOCK || flags & IPL_MPSAFE);

Can we do the same on i386?  I'm trying to test the i8254 path on
modern hardware and I'm tripping the equivalent KASSERT in
apic_intr_establish().

See i386/i386/ioapic.c:

   661  void *
   662  apic_intr_establish(int irq, int type, int level, int (*ih_fun)(void *),
   663  void *ih_arg, const char *ih_what)
   664  {
   665  unsigned int ioapic = APIC_IRQ_APIC(irq);
   666  unsigned int intr = APIC_IRQ_PIN(irq);
   667  struct ioapic_softc *sc = ioapic_find(ioapic);
   668  struct ioapic_pin *pin;
   669  struct intrhand **p, *q, *ih;
   670  extern int cold;
   671  int minlevel, maxlevel;
   672  extern void intr_calculatemasks(void); /* XXX */
   673  int flags;
   674
   675  flags = level & IPL_MPSAFE;
   676  level &= ~IPL_MPSAFE;
   677
   678  KASSERT(level <= IPL_TTY || flags & IPL_MPSAFE);

The patch below lets me test the i8254 clockintr path on modern
hardware in 32-bit mode without needing to rototill the GENERIC
config to delete all the things that implicitly depend upon the
ioapic.

I don't think lying in this case is harmful.  We can only get to
i8254_initclocks() if we have no local APIC, or if
lapic_calibrate_timer() fails.

ok?

Index: clock.c
===
RCS file: /cvs/src/sys/arch/i386/isa/clock.c,v
retrieving revision 1.65
diff -u -p -r1.65 clock.c
--- clock.c 25 Jul 2023 18:16:20 -  1.65
+++ clock.c 21 Aug 2023 03:26:39 -
@@ -431,9 +431,9 @@ i8254_initclocks(void)
clockintr_cpu_init(NULL);
 
/* When using i8254 for clock, we also use the rtc for profclock */
-   (void)isa_intr_establish(NULL, 0, IST_PULSE, IPL_CLOCK,
+   (void)isa_intr_establish(NULL, 0, IST_PULSE, IPL_CLOCK | IPL_MPSAFE,
clockintr, 0, "clock");
-   (void)isa_intr_establish(NULL, 8, IST_PULSE, IPL_STATCLOCK,
+   (void)isa_intr_establish(NULL, 8, IST_PULSE, IPL_STATCLOCK | IPL_MPSAFE,
rtcintr, 0, "rtc");
 
rtcstart(); /* start the mc146818 clock */



Re: all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-19 Thread Scott Cheloha
On Sun, Aug 13, 2023 at 01:48:21PM -0500, Scott Cheloha wrote:
> This is the next patch in the clock interrupt reorganization series.
> 
> Before we continue breaking up the hardclock(9) we need to detour into
> the MD code.
> 
> This patch divides the "initialization" parts of cpu_initclocks() from
> the "start the clock interrupt" parts.  Seprating the two parts leaves
> initclocks() an opportunity to prepare the primary CPU for clock
> interrupt dispatch in a machine-independent manner before actually
> pulling the trigger.  It's nearly impossible to do any MI setup during
> initclocks() because cpu_initclocks() does everything in one go: both
> initialization and kickoff are done when cpu_initclocks() returns.
> 
> Many platforms have a "cpu_startclock()" function, so this patch takes
> that de facto standard and makes it a rule: cpu_startclock() is now
> required.  It is prototyped in sys/systm.h and every platform must
> implement it.
> 
> The revised initclocks() sequence is then:
> 
> 1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
>hz, stathz, and profhz are initialized.  All the machine
>independent setup in step (2) (currently) depends upon
>these machine-dependent values.
> 
> 2. Compute intervals using hz, stathz, and profhz.
> 
>In a later step I will move the full contents of clockintr_init()
>up into initclocks() and get rid of clockintr_init() entirely.
> 
> 3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
>clock interrupt dispatch cycle on the primary CPU.
> 
> I have compiled/booted this patch on amd64 (lapic path), arm64, i386
> (lapic path), macppc, octeon, and sparc64 (sun4v).
> 
> I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
> luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
> here.  Everything else is relatively straightforward, though I may
> have missed a few stray variables here or there.
> 
> Test results?  Ok?

Here is an updated patch that removes several MD prototypes for
cpu_startclock() that I missed the first time through.

I went back and tested these again:

- amd64 (lapic)
- arm64
- i386 (lapic)
- powerpc/macppc
- mips64/octeon (loongson should be fine)
- sparc64 (sys_tick; tick/stick should be fine)

arm/armv7 and riscv64 were tested under the previous version, but I
would appreciate a second compile-test to make sure the header changes
in the updated patch did not break the build (CC phessler@, jsg@).

I am still seeking compile/boot-tests for the following:

- alpha
- hppa
- m88k/luna88k
- powerpc64
- sh/landisk

Test results?  Ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.113
diff -u -p -r1.113 kern_clock.c
--- kern/kern_clock.c   12 Aug 2023 13:19:28 -  1.113
+++ kern/kern_clock.c   19 Aug 2023 18:16:16 -
@@ -103,6 +103,9 @@ initclocks(void)
	profclock_period = 1000000000 / profhz;
 
inittimecounter();
+
+   /* Start dispatching clock interrupts on the primary CPU. */
+   cpu_startclock();
 }
 
 /*
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.164
diff -u -p -r1.164 systm.h
--- sys/systm.h 5 Aug 2023 20:07:56 -   1.164
+++ sys/systm.h 19 Aug 2023 18:16:17 -
@@ -243,6 +243,7 @@ voidinitclocks(void);
 void   inittodr(time_t);
 void   resettodr(void);
 void   cpu_initclocks(void);
+void   cpu_startclock(void);
 
 void   startprofclock(struct process *);
 void   stopprofclock(struct process *);
Index: arch/alpha/alpha/clock.c
===
RCS file: /cvs/src/sys/arch/alpha/alpha/clock.c,v
retrieving revision 1.28
diff -u -p -r1.28 clock.c
--- arch/alpha/alpha/clock.c25 Jul 2023 18:16:19 -  1.28
+++ arch/alpha/alpha/clock.c19 Aug 2023 18:16:17 -
@@ -193,7 +193,11 @@ cpu_initclocks(void)
stathz = hz;
profhz = stathz;
clockintr_init(0);
+}
 
+void
+cpu_startclock(void)
+{
clockintr_cpu_init(NULL);
 
/*
Index: arch/amd64/amd64/machdep.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v
retrieving revision 1.286
diff -u -p -r1.286 machdep.c
--- arch/amd64/amd64/machdep.c  27 Jul 2023 00:28:25 -  1.286
+++ arch/amd64/amd64/machdep.c  19 Aug 2023 18:16:18 -
@@ -227,6 +227,7 @@ paddr_t avail_end;
 
 void (*delay_func)(int) = i8254_delay;
 void (*initclock_func)(void) = i8254_initclocks;
+void (*startclock_func)(void) = i8254_start_both_clocks;
 
 /*
  * Format of boot information passed to us by 32-bit /boot
@@ -1878,6 +1879,12 @@ void
 cpu_i

all platforms: separate cpu_initclocks() from cpu_startclock()

2023-08-13 Thread Scott Cheloha
This is the next patch in the clock interrupt reorganization series.

Before we continue breaking up the hardclock(9) we need to detour into
the MD code.

This patch divides the "initialization" parts of cpu_initclocks() from
the "start the clock interrupt" parts.  Seprating the two parts leaves
initclocks() an opportunity to prepare the primary CPU for clock
interrupt dispatch in a machine-independent manner before actually
pulling the trigger.  It's nearly impossible to do any MI setup during
initclocks() because cpu_initclocks() does everything in one go: both
initialization and kickoff are done when cpu_initclocks() returns.

Many platforms have a "cpu_startclock()" function, so this patch takes
that de facto standard and makes it a rule: cpu_startclock() is now
required.  It is prototyped in sys/systm.h and every platform must
implement it.

The revised initclocks() sequence is then:

1. Call cpu_initclocks().  At minimum, cpu_initclocks() ensures
   hz, stathz, and profhz are initialized.  All the machine
   independent setup in step (2) (currently) depends upon
   these machine-dependent values.

2. Compute intervals using hz, stathz, and profhz.

   In a later step I will move the full contents of clockintr_init()
   up into initclocks() and get rid of clockintr_init() entirely.

3. Call cpu_startclock().  At minimum, cpu_startclock() starts the
   clock interrupt dispatch cycle on the primary CPU.
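
Put together, the resulting initclocks() flow looks roughly like this
(a simplified sketch of the three steps above, not the literal diff;
the only change in this patch is the added cpu_startclock() call):

	void
	initclocks(void)
	{
		/* (1) MD init: hz, stathz, profhz, clockintr_init(), ... */
		cpu_initclocks();

		/* (2) MI interval setup derived from hz/stathz/profhz */
		profclock_period = 1000000000 / profhz;
		inittimecounter();

		/* (3) start clock interrupt dispatch on the primary CPU */
		cpu_startclock();
	}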

I have compiled/booted this patch on amd64 (lapic path), arm64, i386
(lapic path), macppc, octeon, and sparc64 (sun4v).

I am looking for compile/boot tests on alpha, armv7, hppa, landisk,
luna88k, powerpc64, and riscv64.  I think armv7 is the tricky one
here.  Everything else is relatively straightforward, though I may
have missed a few stray variables here or there.

Test results?  Ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.113
diff -u -p -r1.113 kern_clock.c
--- kern/kern_clock.c   12 Aug 2023 13:19:28 -  1.113
+++ kern/kern_clock.c   13 Aug 2023 18:45:30 -
@@ -103,6 +103,9 @@ initclocks(void)
	profclock_period = 1000000000 / profhz;
 
inittimecounter();
+
+   /* Start dispatching clock interrupts on the primary CPU. */
+   cpu_startclock();
 }
 
 /*
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.164
diff -u -p -r1.164 systm.h
--- sys/systm.h 5 Aug 2023 20:07:56 -   1.164
+++ sys/systm.h 13 Aug 2023 18:45:30 -
@@ -243,6 +243,7 @@ voidinitclocks(void);
 void   inittodr(time_t);
 void   resettodr(void);
 void   cpu_initclocks(void);
+void   cpu_startclock(void);
 
 void   startprofclock(struct process *);
 void   stopprofclock(struct process *);
Index: arch/alpha/alpha/clock.c
===
RCS file: /cvs/src/sys/arch/alpha/alpha/clock.c,v
retrieving revision 1.28
diff -u -p -r1.28 clock.c
--- arch/alpha/alpha/clock.c25 Jul 2023 18:16:19 -  1.28
+++ arch/alpha/alpha/clock.c13 Aug 2023 18:45:30 -
@@ -143,7 +143,7 @@ clockattach(dev, fns)
  */
 
 /*
- * Start the real-time and statistics clocks.
+ * Measure and initialize clock frequencies.
  */
 void
 cpu_initclocks(void)
@@ -193,7 +193,14 @@ cpu_initclocks(void)
stathz = hz;
profhz = stathz;
clockintr_init(0);
+}
 
+/*
+ * Start the real-time and statistics clocks.
+ */
+void
+cpu_startclock(void)
+{
clockintr_cpu_init(NULL);
 
/*
Index: arch/amd64/amd64/machdep.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v
retrieving revision 1.286
diff -u -p -r1.286 machdep.c
--- arch/amd64/amd64/machdep.c  27 Jul 2023 00:28:25 -  1.286
+++ arch/amd64/amd64/machdep.c  13 Aug 2023 18:45:31 -
@@ -227,6 +227,7 @@ paddr_t avail_end;
 
 void (*delay_func)(int) = i8254_delay;
 void (*initclock_func)(void) = i8254_initclocks;
+void (*startclock_func)(void) = i8254_start_both_clocks;
 
 /*
  * Format of boot information passed to us by 32-bit /boot
@@ -1878,6 +1879,12 @@ void
 cpu_initclocks(void)
 {
(*initclock_func)();
+}
+
+void
+cpu_startclock(void)
+{
+   (*startclock_func)();
 }
 
 void
Index: arch/amd64/amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.68
diff -u -p -r1.68 lapic.c
--- arch/amd64/amd64/lapic.c26 Apr 2023 10:52:55 -  1.68
+++ arch/amd64/amd64/lapic.c13 Aug 2023 18:45:31 -
@@ -499,8 +499,6 @@ lapic_initclocks(void)
stathz = hz;
profhz = stathz * 10;
clockintr_init(CL_RNDSTAT);
-
-   lapic_startclock();
 }
 
 
@@ -599,6 +597,7 @@ skip_calibration:
	    lapic_per_second * (1ULL << 32) / 1000000000;

Re: hardclock(9), roundrobin: make roundrobin() an independent clock interrupt

2023-08-10 Thread Scott Cheloha
On Thu, Aug 10, 2023 at 07:32:05PM +0200, Martin Pieuchot wrote:
> On 10/08/23(Thu) 12:18, Scott Cheloha wrote:
> > On Thu, Aug 10, 2023 at 01:05:27PM +0200, Martin Pieuchot wrote:
> > [...] 
> > > Can we get rid of `hardclock_period' and use a variable set to 100ms?
> > > This should be tested on alpha which has a hz of 1024 but I'd argue this
> > > is an improvement.
> > 
> > Sure, that's cleaner.  The updated patch below adds a new
> > "roundrobin_period" variable initialized during clockintr_init().
> 
> I'd rather see this variable initialized in sched_bsd.c to 100ms without
> depending on `hz'.  Is is possible?  My point is to untangle this completely
> from `hz'.

Yes, but we need to do that in a separate patch.  This patch isolates
roundrobin() from hardclock() without changing any other behavior.

We can then use the isolated roundrobin() as a known-working starting
point for e.g.:

const uint32_t roundrobin_period = 100000000;   /* 100ms */

Index: kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.79
diff -u -p -r1.79 sched_bsd.c
--- kern/sched_bsd.c5 Aug 2023 20:07:55 -   1.79
+++ kern/sched_bsd.c11 Aug 2023 02:47:03 -
@@ -54,9 +54,8 @@
 #include 
 #endif
 
-
+uint32_t roundrobin_period;/* [I] roundrobin period (ns) */
 intlbolt;  /* once a second sleep address */
-intrrticks_init;   /* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
 struct __mp_lock sched_lock;
@@ -69,21 +68,23 @@ uint32_tdecay_aftersleep(uint32_t, uin
  * Force switch among equal priority processes every 100ms.
  */
 void
-roundrobin(struct cpu_info *ci)
+roundrobin(struct clockintr *cl, void *cf)
 {
+   struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
+   uint64_t count;
 
-   spc->spc_rrticks = rrticks_init;
+   count = clockintr_advance(cl, roundrobin_period);
 
if (ci->ci_curproc != NULL) {
-   if (spc->spc_schedflags & SPCF_SEENRR) {
+   if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
/*
 * The process has already been through a roundrobin
 * without switching and may be hogging the CPU.
 * Indicate that the process should yield.
 */
			atomic_setbits_int(&spc->spc_schedflags,
-   SPCF_SHOULDYIELD);
+   SPCF_SEENRR | SPCF_SHOULDYIELD);
} else {
			atomic_setbits_int(&spc->spc_schedflags,
SPCF_SEENRR);
@@ -695,8 +696,6 @@ scheduler_start(void)
 * its job.
 */
	timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
-
-   rrticks_init = hz / 10;
	schedcpu(&schedcpu_to);
 
 #ifndef SMALL_KERNEL
Index: kern/kern_sched.c
===
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.84
diff -u -p -r1.84 kern_sched.c
--- kern/kern_sched.c   5 Aug 2023 20:07:55 -   1.84
+++ kern/kern_sched.c   11 Aug 2023 02:47:03 -
@@ -102,6 +102,12 @@ sched_init_cpu(struct cpu_info *ci)
if (spc->spc_profclock == NULL)
panic("%s: clockintr_establish profclock", __func__);
}
+   if (spc->spc_roundrobin == NULL) {
+   spc->spc_roundrobin = clockintr_establish(&ci->ci_queue,
+   roundrobin);
+   if (spc->spc_roundrobin == NULL)
+   panic("%s: clockintr_establish roundrobin", __func__);
+   }
 
kthread_create_deferred(sched_kthreads_create, ci);
 
Index: kern/kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.30
diff -u -p -r1.30 kern_clockintr.c
--- kern/kern_clockintr.c   5 Aug 2023 20:07:55 -   1.30
+++ kern/kern_clockintr.c   11 Aug 2023 02:47:03 -
@@ -69,6 +69,7 @@ clockintr_init(u_int flags)
 
	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;
+   roundrobin_period = hardclock_period * 10;
 
	KASSERT(stathz >= 1 && stathz <= 1000000000);
 
@@ -204,6 +205,11 @@ clockintr_cpu_init(const struct intrcloc
clockintr_stagger(spc->spc_profclock, profclock_period,
multiplier, MAXCPUS);
}
+   if (spc->spc_roundrobin->cl_expiration == 0) {
+   clockintr_stagger(spc->spc_roundrobin, hardclock_period,
+   multiplier, MAXCPUS);
+   }
+   clockintr_advance(spc->spc_roundrobin, roundrobin_period);
 

Re: agtimer(4/arm64): simplify agtimer_delay()

2023-08-10 Thread Scott Cheloha
On Tue, Aug 08, 2023 at 08:00:47PM +0200, Mark Kettenis wrote:
> > From: Dale Rahn 
> > Date: Tue, 8 Aug 2023 12:36:45 -0400
> > 
> > Switching the computation of cycles/delaycnt to a proper 64 value math
> > instead of the '32 bit safe' complex math is likely a good idea.
> > However I am not completely convinced that switching to 'yield' (current
> > CPU_BUSY_CYCLE implementation) for every loop of a 'short wait' in the wait
> > loop makes sense. In a hypervisor environment, this could cause a very
> > short wait between register writes to become very long, In a non-hypervisor
> > environment there is essentially no improvement because the yield doesn't
> > really have any benefits on non-hypervisor.
> 
> Dale, I think you're confused here.  There is no architectural way to
> trap a YIELD instruction.  The docs explicitly state that SMT and SMP
> systems.  I suspect that this instruction was primarily introduced for
> SMT systems that never materialized; I completely believe you that on
> current hardware it does nothing.
> 
> Even without a YIELD instruction a hypervisor might interrupt us and
> schedule a different vCPU onto the core.  And on real hardware
> external interrupts may happen.  So you really can't count on delay(9)
> being accurate.
> 
> Linux does use YIELD in it delay loop.

Okay good.

> > To my current understanding, there is no useful 'wait short period' on arm
> > cores.
> 
> There is WFET (and WFIT), but that is Armv8.7 material, so not
> availble on actual hardware that we run on.  Linux uses that
> instruction in its delay loop when available.

These look great.  Can't wait for them to be implemented outside of a
functional simulator.

> On machines with a working Generic Timer event stream, Linux uses WFE
> if the delay is long enough.

I have a prototype for this.  Obviously it's a separate patch.
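
For reference, the idea (not the prototype itself, just a sketch, and
assuming the generic timer event stream is enabled so WFE wakes up
periodically) would be something along these lines:

	while (agtimer_readcnt64() - start < cycles)
		__asm volatile("wfe" ::: "memory");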

Anyway, can I go ahead with the patch below to start?

- Use 64-bit arithmetic to simplify agtimer_delay().

- Use CPU_BUSY_CYCLE() ("yield" on arm64) in the loop to match other
  delay(9) implementations.

Index: agtimer.c
===
RCS file: /cvs/src/sys/arch/arm64/dev/agtimer.c,v
retrieving revision 1.23
diff -u -p -r1.23 agtimer.c
--- agtimer.c   25 Jul 2023 18:16:19 -  1.23
+++ agtimer.c   10 Aug 2023 20:59:27 -
@@ -323,32 +323,12 @@ agtimer_cpu_initclocks(void)
 void
 agtimer_delay(u_int usecs)
 {
-   uint64_tclock, oclock, delta, delaycnt;
-   uint64_tcsec, usec;
-   volatile intj;
+   uint64_t cycles, start;
 
-   if (usecs > (0x80000000 / agtimer_frequency)) {
-   csec = usecs / 10000;
-   usec = usecs % 10000;
-
-   delaycnt = (agtimer_frequency / 100) * csec +
-   (agtimer_frequency / 100) * usec / 10000;
-   } else {
-   delaycnt = agtimer_frequency * usecs / 1000000;
-   }
-   if (delaycnt <= 1)
-   for (j = 100; j > 0; j--)
-   ;
-
-   oclock = agtimer_readcnt64();
-   while (1) {
-   for (j = 100; j > 0; j--)
-   ;
-   clock = agtimer_readcnt64();
-   delta = clock - oclock;
-   if (delta > delaycnt)
-   break;
-   }
+   start = agtimer_readcnt64();
+   cycles = (uint64_t)usecs * agtimer_frequency / 1000000;
+   while (agtimer_readcnt64() - start < cycles)
+   CPU_BUSY_CYCLE();
 }
 
 void



Re: hardclock(9), roundrobin: make roundrobin() an independent clock interrupt

2023-08-10 Thread Scott Cheloha
On Thu, Aug 10, 2023 at 01:05:27PM +0200, Martin Pieuchot wrote:
> On 05/08/23(Sat) 17:17, Scott Cheloha wrote:
> > This is the next piece of the clock interrupt reorganization patch
> > series.
> 
> The round robin logic is here to make sure process doesn't hog a CPU.
> The period to tell a process it should yield doesn't have to be tied
> to the hardclock period.  We want to be sure a process doesn't run more
> than 100ms at a time.

> Is the priority of this new clock interrupt the same as the hardclock?

Yes.  Clock interrupts on a given CPU are dispatched in order of
expiration.  If two clock interrupts on the same CPU have the same
expiration value they are dispatched in FIFO order.

> I don't understand what clockintr_advance() is doing.  Maybe you could
> write a manual for it?

clockintr_advance() is a convenience wrapper for clockintr_schedule().
It reschedules periodic interrupts without drift.
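
If it helps, the semantics can be sketched like this (illustration
only, not the real implementation):

	/* Advance the expiration by whole periods until it is in the
	 * future, then reschedule; the return value is the number of
	 * periods that elapsed. */
	count = 0;
	while (cl->cl_expiration <= now) {
		cl->cl_expiration += period;
		count++;
	}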

The manpage update is a work in progress.

> I'm afraid we could wait 200ms now?  Or what `count' of 2 mean?

No.  roundrobin() is still scheduled to run every 100ms.  The code
change ensures we properly account for situations where roundrobin()
is so late that two or more roundrobin periods have elapsed:

> @@ -69,21 +68,23 @@ uint32_t  decay_aftersleep(uint32_t, uin
>   * Force switch among equal priority processes every 100ms.
>   */
>  void
> -roundrobin(struct cpu_info *ci)
> +roundrobin(struct clockintr *cl, void *cf)
>  {
> + struct cpu_info *ci = curcpu();
>   struct schedstate_percpu *spc = >ci_schedstate;
> + uint64_t count;
>  
> - spc->spc_rrticks = rrticks_init;
> + count = clockintr_advance(cl, roundrobin_period);
>  
>   if (ci->ci_curproc != NULL) {
> - if (spc->spc_schedflags & SPCF_SEENRR) {
> + if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
>   /*
>* The process has already been through a roundrobin
>* without switching and may be hogging the CPU.
>* Indicate that the process should yield.
>*/
>   atomic_setbits_int(>spc_schedflags,
> - SPCF_SHOULDYIELD);
> + SPCF_SEENRR | SPCF_SHOULDYIELD);
>   } else {
>   atomic_setbits_int(>spc_schedflags,
>   SPCF_SEENRR);

In such a situation, we want to set both SPCF_SEENRR and
SPCF_SHOULDYIELD on the thread.  This simulates what would have
happened under normal circumstances, i.e. the thread would have
been interrupted by roundrobin() two separate times.

> Same question for clockintr_stagger().

clockintr_stagger() adjusts the starting offset for the given clock
interrupt.  We use it to keep identical clock interrupts from expiring
simultaneously across every CPU in the system.
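
The calls in the diff below look like this; "multiplier" is the CPU's
index, so each CPU starts the same periodic interrupt at a different
offset within the period (presumably something close to
period * multiplier / MAXCPUS):

	clockintr_stagger(spc->spc_roundrobin, hardclock_period,
	    multiplier, MAXCPUS);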

> Can we get rid of `hardclock_period' and use a variable set to 100ms?
> This should be tested on alpha which has a hz of 1024 but I'd argue this
> is an improvement.

Sure, that's cleaner.  The updated patch below adds a new
"roundrobin_period" variable initialized during clockintr_init().

Index: kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.79
diff -u -p -r1.79 sched_bsd.c
--- kern/sched_bsd.c5 Aug 2023 20:07:55 -   1.79
+++ kern/sched_bsd.c10 Aug 2023 17:15:53 -
@@ -54,9 +54,8 @@
 #include 
 #endif
 
-
+uint32_t roundrobin_period;/* [I] roundrobin period (ns) */
 intlbolt;  /* once a second sleep address */
-intrrticks_init;   /* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
 struct __mp_lock sched_lock;
@@ -69,21 +68,23 @@ uint32_tdecay_aftersleep(uint32_t, uin
  * Force switch among equal priority processes every 100ms.
  */
 void
-roundrobin(struct cpu_info *ci)
+roundrobin(struct clockintr *cl, void *cf)
 {
+   struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = >ci_schedstate;
+   uint64_t count;
 
-   spc->spc_rrticks = rrticks_init;
+   count = clockintr_advance(cl, roundrobin_period);
 
if (ci->ci_curproc != NULL) {
-   if (spc->spc_schedflags & SPCF_SEENRR) {
+   if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
/*
 * The process has already been through a roundrobin
 * without switching and may be hogging the CPU.
 * Indicate that the process should yield.
 */
			atomic_setbits_int(&spc->spc_schedflags,
-   SPCF_SHOULDYIE

Re: EVFILT_TIMER add support for different timer precisions NOTE_{,U,N,M}SECONDS

2023-08-08 Thread Scott Cheloha
On Sat, Aug 05, 2023 at 01:33:05AM -0400, A Tammy wrote:
> 
> On 8/5/23 00:49, Scott Cheloha wrote:
> > On Sat, Aug 05, 2023 at 12:17:48AM -0400, aisha wrote:
> >> On 22/09/10 01:53PM, Visa Hankala wrote:
> >>> On Wed, Aug 31, 2022 at 04:48:37PM -0400, aisha wrote:
> >>>> I've added a patch which adds support for NOTE_{,U,M,N}SECONDS for
> >>>> EVFILT_TIMER in the kqueue interface.
> >>> It sort of makes sense to add an option to specify timeouts in
> >>> sub-millisecond precision. It feels complete overengineering to add
> >>> multiple time units on the level of the kernel interface. However,
> >>> it looks that FreeBSD and NetBSD have already done this following
> >>> macOS' lead...
> >>>
> >>>> I've also added the NOTE_ABSTIME but haven't done any actual 
> >>>> implementation
> >>>> there as I am not sure how the `data` field should be interpreted (is it
> >>>> absolute time in seconds since epoch?).
> >>> I think FreeBSD and NetBSD take NOTE_ABSTIME as time since the epoch.
> >>>
> >>> Below is a revised patch that takes into account some corner cases.
> >>> It tries to be API-compatible with FreeBSD and NetBSD. I have adjusted
> >>> the NOTE_{,M,U,N}SECONDS flags so that they are enum-like.
> >>>
> >>> The manual page bits are from NetBSD.
> >>>
> >>> It is quite late to introduce a feature like this within this release
> >>> cycle. Until now, the timer code has ignored the fflags field. There
> >>> might be pieces of software that are careless with struct kevent and
> >>> that would break as a result of this patch. Programs that are widely
> >>> used on different BSDs are probably fine already, though.
> >> 
> >> Sorry, I had forgotten this patch for a long time!!! I've been running 
> >> with this for a while now and it's been working nicely.
> > 
> > Where is this being used in ports?  I think having "one of each" for
> > seconds, milliseconds, microseconds, and nanoseconds is (as visa
> > noted) way, way over-the-top.
> 
> I was using it with a port that I sent out a while ago but never got
> into tree (was before I joined the project) -
> https://marc.info/?l=openbsd-ports=165715874509440=2

If nothing in ports is using this I am squeamish about adding it.
Once we add it, we're stuck maintaining it, warts and all.

If www/workflow were in the tree I could see the upside.  Is it in
ports?

It looks like workflow actually wants timerfd(2) from Linux and is
simulating timerfd(2) with EVFILT_TIMER and NOTE_NSECONDS:

https://github.com/sogou/workflow/blob/80b3dfbad2264bcd79ba37811c66421490e337d2/src/kernel/poller.c#L227

I think timerfd(2) is the superior interface here.  It keeps the POSIX
interval timer semantics without all the signal delivery baggage.  It
also supports multiple clocks and starting a periodic timeout from an
absolute starting time.

So, if the goal is "add www/workflow to ports", adding timerfd(2) might
be the right thing.
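
For comparison, roughly what the two interfaces look like from
userspace.  The kqueue variant assumes the NOTE_NSECONDS support
discussed in this thread; the timerfd(2) calls are the Linux API,
which OpenBSD does not have today.  "interval_ns" is assumed to be
less than one second here for brevity:

	/* kqueue/EVFILT_TIMER with the proposed NOTE_NSECONDS: */
	int kq = kqueue();
	struct kevent kev;
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, NOTE_NSECONDS,
	    interval_ns, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	/* Linux timerfd(2): pick a clock, then arm a periodic timer: */
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);
	struct itimerspec its = {
		.it_value = { .tv_sec = 0, .tv_nsec = interval_ns },
		.it_interval = { .tv_sec = 0, .tv_nsec = interval_ns },
	};
	timerfd_settime(fd, 0, &its, NULL);
	/* each read(2) on fd returns the expiration count as a uint64_t */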

> I also agree with it being over the top but that's the way it is in
> net/freebsd, I'm also fine with breaking compatibility and only keeping
> nano, no preferences either way.

Well, if we're going to add it (if), we should add all of it.  The
vast majority of the code is not conversion code: if we add support
for NOTE_NSECONDS, adding support for the other units is trivial, and
there is value in being fully compatible with other implementations.

> > The original EVFILT_TIMER supported only milliseconds, yes.  Given
> > that it debuted in the late 90s, I think that was a bad choice.  But
> > when milliseconds were insufficiently precise, the obvious thing would
> > be to add support for nanoseconds... and then stop.
> >
> > The decision to use the UTC clock with no option to select a different
> > clockid_t for NOTE_ABSTIME is also unfortunate.
> 
> Yes, furthermore this was very unclear as I couldn't find this in the
> man pages for either of net/freebsd.
> 
> > Grumble.
> >
> >> I had an unrelated question inlined.
> >>
> >> [...]
> >>>  static void
> >>> -filt_timer_timeout_add(struct knote *kn)
> >>> +filt_timeradd(struct knote *kn, struct timespec *ts)
> >>>  {
> >>> - struct timeval tv;
> >>> + struct timespec expiry, now;
> >>>   struct timeout *to = kn->kn_hook;
> >>>   int tticks;
> >>>  
> >>> - tv.tv_sec = kn->kn_sdata / 1000;
> >>> - tv.tv_usec = (kn->kn_sdata % 1000) * 1000;

agtimer(4/arm64): simplify agtimer_delay()

2023-08-07 Thread Scott Cheloha
The agtimer(4/arm64) delay(9) implementation is quite complicated.
This patch simplifies it.

Am I missing something here?  There's no reason to implement the
busy-loop like this, right?

ok?

Index: agtimer.c
===
RCS file: /cvs/src/sys/arch/arm64/dev/agtimer.c,v
retrieving revision 1.23
diff -u -p -r1.23 agtimer.c
--- agtimer.c   25 Jul 2023 18:16:19 -  1.23
+++ agtimer.c   8 Aug 2023 02:24:57 -
@@ -323,32 +323,12 @@ agtimer_cpu_initclocks(void)
 void
 agtimer_delay(u_int usecs)
 {
-   uint64_tclock, oclock, delta, delaycnt;
-   uint64_tcsec, usec;
-   volatile intj;
+   uint64_t cycles, start;
 
-   if (usecs > (0x80000000 / agtimer_frequency)) {
-   csec = usecs / 10000;
-   usec = usecs % 10000;
-
-   delaycnt = (agtimer_frequency / 100) * csec +
-   (agtimer_frequency / 100) * usec / 10000;
-   } else {
-   delaycnt = agtimer_frequency * usecs / 1000000;
-   }
-   if (delaycnt <= 1)
-   for (j = 100; j > 0; j--)
-   ;
-
-   oclock = agtimer_readcnt64();
-   while (1) {
-   for (j = 100; j > 0; j--)
-   ;
-   clock = agtimer_readcnt64();
-   delta = clock - oclock;
-   if (delta > delaycnt)
-   break;
-   }
+   start = agtimer_readcnt64();
+   cycles = (uint64_t)usecs * agtimer_frequency / 1000000;
+   while (agtimer_readcnt64() - start < cycles)
+   CPU_BUSY_CYCLE();
 }
 
 void



hardclock(9), roundrobin: make roundrobin() an independent clock interrupt

2023-08-05 Thread Scott Cheloha
This is the next piece of the clock interrupt reorganization patch
series.

This patch removes the roundrobin() call from hardclock() and makes
roundrobin() an independent clock interrupt.

- Revise roundrobin() to make it a valid clock interrupt callback.
  It remains periodic.  It still runs at one tenth of the hardclock
  frequency.

- Account for multiple expirations in roundrobin().  If two or more
  intervals have elapsed we set SPCF_SHOULDYIELD immediately.

  This preserves existing behavior: hardclock() is called multiple
  times during clockintr_hardclock() if clock interrupts are blocked
  for long enough.

- Each schedstate_percpu has its own roundrobin() handle, spc_roundrobin.
  spc_roundrobin is established during sched_init_cpu(), staggered during
  the first clockintr_cpu_init() call, and advanced during clockintr_cpu_init().
  Expirations during suspend/resume are discarded.

- spc_rrticks and rrticks_init are now useless.  Delete them.

ok?

Also, yes, I see the growing pile of scheduler-controlled clock
interrupt handles.  My current plan is to move the setup code at the
end of clockintr_cpu_init() to a different routine, maybe something
like "sched_start_cpu()".  On the primary CPU you'd call it immediately
after cpu_initclocks().  On secondary CPUs you'd call it at the end of
cpu_hatch() just before cpu_switchto().

In any case, we will need to find a home for that code someplace.  It
can't stay in clockintr_cpu_init() forever.

Index: kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.79
diff -u -p -r1.79 sched_bsd.c
--- kern/sched_bsd.c5 Aug 2023 20:07:55 -   1.79
+++ kern/sched_bsd.c5 Aug 2023 22:15:25 -
@@ -56,7 +56,6 @@
 
 
 intlbolt;  /* once a second sleep address */
-intrrticks_init;   /* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
 struct __mp_lock sched_lock;
@@ -69,21 +68,23 @@ uint32_tdecay_aftersleep(uint32_t, uin
  * Force switch among equal priority processes every 100ms.
  */
 void
-roundrobin(struct cpu_info *ci)
+roundrobin(struct clockintr *cl, void *cf)
 {
+   struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
+   uint64_t count;
 
-   spc->spc_rrticks = rrticks_init;
+   count = clockintr_advance(cl, hardclock_period * 10);
 
if (ci->ci_curproc != NULL) {
-   if (spc->spc_schedflags & SPCF_SEENRR) {
+   if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
/*
 * The process has already been through a roundrobin
 * without switching and may be hogging the CPU.
 * Indicate that the process should yield.
 */
atomic_setbits_int(&spc->spc_schedflags,
-   SPCF_SHOULDYIELD);
+   SPCF_SEENRR | SPCF_SHOULDYIELD);
} else {
atomic_setbits_int(&spc->spc_schedflags,
SPCF_SEENRR);
@@ -695,8 +696,6 @@ scheduler_start(void)
 * its job.
 */
timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
-
-   rrticks_init = hz / 10;
schedcpu(&schedcpu_to);
 
 #ifndef SMALL_KERNEL
Index: kern/kern_sched.c
===
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.84
diff -u -p -r1.84 kern_sched.c
--- kern/kern_sched.c   5 Aug 2023 20:07:55 -   1.84
+++ kern/kern_sched.c   5 Aug 2023 22:15:25 -
@@ -102,6 +102,12 @@ sched_init_cpu(struct cpu_info *ci)
if (spc->spc_profclock == NULL)
panic("%s: clockintr_establish profclock", __func__);
}
+   if (spc->spc_roundrobin == NULL) {
+   spc->spc_roundrobin = clockintr_establish(&ci->ci_queue,
+   roundrobin);
+   if (spc->spc_roundrobin == NULL)
+   panic("%s: clockintr_establish roundrobin", __func__);
+   }
 
kthread_create_deferred(sched_kthreads_create, ci);
 
Index: kern/kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.30
diff -u -p -r1.30 kern_clockintr.c
--- kern/kern_clockintr.c   5 Aug 2023 20:07:55 -   1.30
+++ kern/kern_clockintr.c   5 Aug 2023 22:15:25 -
@@ -204,6 +204,11 @@ clockintr_cpu_init(const struct intrcloc
clockintr_stagger(spc->spc_profclock, profclock_period,
multiplier, MAXCPUS);
}
+   if (spc->spc_roundrobin->cl_expiration == 0) {
+   clockintr_stagger(spc->spc_roundrobin, hardclock_period,
+   multiplier, MAXCPUS);
+   }
+   clockintr_advance(spc->spc_roundrobin, hardclock_period * 10);
 
if 

Re: EVFILT_TIMER add support for different timer precisions NOTE_{,U,N,M}SECONDS

2023-08-04 Thread Scott Cheloha
On Sat, Aug 05, 2023 at 12:17:48AM -0400, aisha wrote:
> On 22/09/10 01:53PM, Visa Hankala wrote:
> > On Wed, Aug 31, 2022 at 04:48:37PM -0400, aisha wrote:
> > > I've added a patch which adds support for NOTE_{,U,M,N}SECONDS for
> > > EVFILT_TIMER in the kqueue interface.
> > 
> > It sort of makes sense to add an option to specify timeouts in
> > sub-millisecond precision. It feels complete overengineering to add
> > multiple time units on the level of the kernel interface. However,
> > it looks that FreeBSD and NetBSD have already done this following
> > macOS' lead...
> > 
> > > I've also added the NOTE_ABSTIME but haven't done any actual 
> > > implementation
> > > there as I am not sure how the `data` field should be interpreted (is it
> > > absolute time in seconds since epoch?).
> > 
> > I think FreeBSD and NetBSD take NOTE_ABSTIME as time since the epoch.
> > 
> > Below is a revised patch that takes into account some corner cases.
> > It tries to be API-compatible with FreeBSD and NetBSD. I have adjusted
> > the NOTE_{,M,U,N}SECONDS flags so that they are enum-like.
> > 
> > The manual page bits are from NetBSD.
> > 
> > It is quite late to introduce a feature like this within this release
> > cycle. Until now, the timer code has ignored the fflags field. There
> > might be pieces of software that are careless with struct kevent and
> > that would break as a result of this patch. Programs that are widely
> > used on different BSDs are probably fine already, though.
> 
> Sorry, I had forgotten this patch for a long time!!! I've been running with 
> this for a while now and it's been working nicely.

Where is this being used in ports?  I think having "one of each" for
seconds, milliseconds, microseconds, and nanoseconds is (as visa
noted) way, way over-the-top.

The original EVFILT_TIMER supported only milliseconds, yes.  Given
that it debuted in the late 90s, I think that was a bad choice.  But
when milliseconds were insufficiently precise, the obvious thing would
be to add support for nanoseconds... and then stop.

The decision to use the UTC clock with no option to select a different
clockid_t for NOTE_ABSTIME is also unfortunate.

Grumble.
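
For reference, consumer code for the proposed interface would look
roughly like the sketch below.  NOTE_NSECONDS here is the flag from
the patch under discussion (as on FreeBSD/NetBSD); it is not in our
tree today, and a single nanosecond unit would serve this use just as
well:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <err.h>

	int
	main(void)
	{
		struct kevent kev, ev;
		int kq;

		if ((kq = kqueue()) == -1)
			err(1, "kqueue");

		/* One-shot 250ms timer using the proposed NOTE_NSECONDS unit. */
		EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT, NOTE_NSECONDS,
		    250000000LL, NULL);
		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
			err(1, "kevent: register");

		/* Block until the timer fires. */
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
			err(1, "kevent: wait");
		return 0;
	}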

> I had an unrelated question inlined.
> 
> [...]
> > 
> >  static void
> > -filt_timer_timeout_add(struct knote *kn)
> > +filt_timeradd(struct knote *kn, struct timespec *ts)
> >  {
> > -   struct timeval tv;
> > +   struct timespec expiry, now;
> > struct timeout *to = kn->kn_hook;
> > int tticks;
> >  
> > -   tv.tv_sec = kn->kn_sdata / 1000;
> > -   tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
> > -   tticks = tvtohz(&tv);
> > -   /* Remove extra tick from tvtohz() if timeout has fired before. */
> > +   if (kn->kn_sfflags & NOTE_ABSTIME) {
> > +   nanotime(&now);
> > +   if (timespeccmp(ts, &now, >)) {
> > +   timespecsub(ts, &now, &expiry);
> > +   /* XXX timeout_at_ts */
> > +   timeout_add(to, tstohz(&expiry));

visa:

we should use timeout_abs_ts() here.  I need to adjust it, though.

> > +   } else {
> > +   /* Expire immediately. */
> > +   filt_timerexpire(kn);
> > +   }
> > +   return;
> > +   }
> > +
> > +   tticks = tstohz(ts);
> > +   /* Remove extra tick from tstohz() if timeout has fired before. */
> > if (timeout_triggered(to))
> > tticks--;
> 
> I always wondered why one tick was removed, is one tick really
> that important? And does a timeout firing only cost one tick?

When you convert a duration to a count of ticks with tstohz(), it adds
an extra tick to the result to keep you from undershooting your
timeout.  You start counting your timeout at the start of the *next*
tick, otherwise the timeout might fire early.  However, after the
timeout has expired once, you no longer need the extra tick because
you can (more or less) assume that the timeout is running at the start
of the new tick.

I know that sounds a little fuzzy, but in practice it works.
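
A small numeric model of what I mean, with hz=100 (10ms ticks).  This
is a simplification for illustration, not the actual tstohz() code:

	#include <stdint.h>

	/*
	 * Round the duration up to whole ticks, then add one tick
	 * because the current tick is already in progress when the
	 * timeout is armed.
	 */
	static int
	model_tstohz(uint64_t ns, uint64_t tick_nsec)
	{
		return (int)((ns + tick_nsec - 1) / tick_nsec + 1);
	}

	/*
	 * model_tstohz(25000000, 10000000) == 4: a 25ms timeout becomes
	 * 3 ticks to cover the duration rounded up, plus one because we
	 * start counting mid-tick.  Once the timeout has fired we are
	 * aligned to a tick boundary, so the rearm path subtracts that
	 * extra tick again.
	 */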



Re: uvm_loadav: don't recompute schedstate_percpu.spc_nrun

2023-08-03 Thread Scott Cheloha
On Thu, Aug 03, 2023 at 09:09:30AM -0600, Theo de Raadt wrote:
> Scott Cheloha  wrote:
> 
> > > > How about this. Kill the spc_ldavg calculation. Its use is more then
> > > > questionable. The cpu selection code using this is not wroking well and
> > > > process stealing will do the rest.
> > 
> > This is more or less what I said yesterday.  The per-CPU load average
> > is not useful for deciding where to put a thread.
> 
> I guess you have not been reading mpi's scheduler diff.  The entire idea
> of "placing a thread" is 1980's single-processor flawed.

Dude, I'm not talking about mpi's patch, I'm talking about what's in
the tree.

Given the current state of the scheduler, we can throw out spc_ldavg.
It isn't necessary.

> > Of the variables we
> > have available to consider, only the current length of the runqueue is
> > useful.
> 
> No, that concept is also broken.

Again, I am talking about the current scheduler.

Said another way: the current approach can limp along just fine using
only the runqueue length.  We can get rid of spc_ldavg without
worrying about it.



Re: uvm_loadav: don't recompute schedstate_percpu.spc_nrun

2023-08-03 Thread Scott Cheloha
On Thu, Aug 03, 2023 at 02:38:11PM +0200, Mark Kettenis wrote:
> > Date: Thu, 3 Aug 2023 12:56:01 +0200
> > From: Claudio Jeker 
> > 
> > On Thu, Aug 03, 2023 at 10:53:24AM +0200, Claudio Jeker wrote:
> > > On Thu, Aug 03, 2023 at 10:13:57AM +0200, Martin Pieuchot wrote:
> > > > On 02/08/23(Wed) 14:22, Claudio Jeker wrote:
> > > > > On Mon, Jul 31, 2023 at 10:21:11AM -0500, Scott Cheloha wrote:
> > > > > > On Fri, Jul 28, 2023 at 07:36:41PM -0500, Scott Cheloha wrote:
> > > > > > > claudio@ notes that uvm_loadav() pointlessly walks the allproc 
> > > > > > > list to
> > > > > > > recompute schedstate_percpu.spn_nrun for each CPU.
> > > > > > > 
> > > > > > > We can just use the value instead of recomputing it.
> > > > > > 
> > > > > > Whoops, off-by-one.  The current load averaging code includes the
> > > > > > running thread in the nrun count if it is *not* the idle thread.
> > > > > 
> > > > > Yes, with this the loadavg seems to be consistent and following the 
> > > > > number
> > > > > of running processes. The code seems to behave like before (with all 
> > > > > its
> > > > > quirks).
> > > > > 
> > > > > OK claudio@, this is a good first step. Now I think this code should 
> > > > > later
> > > > > be moved into kern_sched.c or sched_bsd.c and removed from uvm. Not 
> > > > > sure why
> > > > > the load calculation is part of memory management...
> > > > > 
> > > > > On top of this I wonder about the per-CPU load calculation. In my 
> > > > > opinion
> > > > > it is wrong to skip the calculation if the CPU is idle. Because of 
> > > > > this
> > > > > there is no decay for idle CPUs and that feels wrong to me.
> > > > > Do we have a userland utility that reports spc_ldavg?
> > > > 
> > > > I don't understand why the SCHED_LOCK() is needed.  Since I'm really
> > > > against adding new uses for it, could you comment on that?
> > > 
> > > The question is how sloppy do we want to be. This code looks at
> > > ci_schedstate (spc_idleproc and spc_nrun) and ci_curproc so the be correct
> > > this needs to lock the scheduler. Do we really want that, hell no.
> >   
> > How about this. Kill the spc_ldavg calculation. Its use is more then
> > questionable. The cpu selection code using this is not wroking well and
> > process stealing will do the rest.

This is more or less what I said yesterday.  The per-CPU load average
is not useful for deciding where to put a thread.  Of the variables we
have available to consider, only the current length of the runqueue is
useful.

Go for it, kill it.

One nit below.

> > Also use sched_cpu_idle to know if a cpu is idle.

(This is a neat trick, nice.)

> > Index: kern/kern_sched.c
> > ===
> > RCS file: /cvs/src/sys/kern/kern_sched.c,v
> > retrieving revision 1.81
> > diff -u -p -r1.81 kern_sched.c
> > --- kern/kern_sched.c   27 Jul 2023 17:52:53 -  1.81
> > +++ kern/kern_sched.c   3 Aug 2023 08:41:38 -
> > @@ -373,7 +373,6 @@ sched_choosecpu_fork(struct proc *parent
> >  {
> >  #ifdef MULTIPROCESSOR
> > struct cpu_info *choice = NULL;
> > -   fixpt_t load, best_load = ~0;
> > int run, best_run = INT_MAX;
> > struct cpu_info *ci;
> > struct cpuset set;
> > @@ -407,13 +406,10 @@ sched_choosecpu_fork(struct proc *parent
> > while ((ci = cpuset_first()) != NULL) {
> > cpuset_del(, ci);
> >  
> > -   load = ci->ci_schedstate.spc_ldavg;
> > run = ci->ci_schedstate.spc_nrun;
> >  
> > -   if (choice == NULL || run < best_run ||
> > -   (run == best_run && load < best_load)) {
> > +   if (choice == NULL || run < best_run) {
> > choice = ci;
> > -   best_load = load;
> > best_run = run;
> > }
> > }
> > @@ -605,11 +601,6 @@ sched_proc_to_cpu_cost(struct cpu_info *
> >  */
> > if (CPU_IS_PRIMARY(ci))
> > cost += sched_cost_runnable;
> > -
> > -   /*
> > -* Higher load on the destination means we don't want to go there.
> > -*/
> > -

Re: hardclock: move setitimer(2) code into itimer_update()

2023-08-02 Thread Scott Cheloha
On Thu, Jul 27, 2023 at 10:45:50PM -0500, Scott Cheloha wrote:
> On Wed, Jul 26, 2023 at 11:16:19AM -0500, Scott Cheloha wrote:
> > This is the next patch in the clock interrupt reorganization series.
> > 
> > Now that statclock() is cleaned up we can turn to hardclock().
> > 
> > [...]
> > 
> > This patch moves the setitimer(2) code out of hardclock().  The big
> > idea is identical to what we did with profil(2)/profclock in the
> > profclock/gmonclock patch.
> > 
> > - Move the setitimer(2) polling code from hardclock() to a new clock
> >   interrupt routine, itimer_update(), in kern_time.c.  itimer_update()
> >   is periodic and runs at the same frequency as the hardclock.
> > 
> > - Each schedstate_percpu has its own itimer_update() handle, spc_itimer,
> >   initialized during sched_init_cpu().
> > 
> > - The itimer_update() on a given CPU is enabled/disabled in
> >   mi_switch()/sched_exit() if the running thread's process has enabled
> >   ITIMER_VIRTUAL/ITIMER_PROF.  A new scheduler flag, SPCF_ITIMER,
> >   signifies whether itimer_update() was started and needs stopping.
> > 
> > - A new per-process flag, PS_ITIMER, signifies whether any virtual
> >   interval timers are running.  The flag is updated from the helper
> >   routine process_reset_itimer_flag().  We use it during mi_switch()
> >   to decide whether to start itimer_update() without entering itimer_mtx.
> > 
> > - In setitimer(), call need_resched() when the process changes the
> >   state of ITIMER_VIRTUAL/ITIMER_PROF to force itimer_update() on/off.
> > 
> > regress/sys/kern/itimer passes.
> 
> Updated patch:
> 
> - Rebase on kern_clockintr.c,v1.29 and kern_sched.c,v1.81
> 
> - Stagger spc_itimer in clockintr_cpu_init() alongside spc_profclock
>   until I can figure out where else to do it

Ping.

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.110
diff -u -p -r1.110 kern_clock.c
--- kern/kern_clock.c   1 Aug 2023 07:57:55 -   1.110
+++ kern/kern_clock.c   2 Aug 2023 23:32:39 -
@@ -106,41 +106,12 @@ initclocks(void)
 }
 
 /*
- * hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
- * We don't want to send signals with psignal from hardclock because it makes
- * MULTIPROCESSOR locking very complicated. Instead, to use an idea from
- * FreeBSD, we set a flag on the thread and when it goes to return to
- * userspace it signals itself.
- */
-
-/*
  * The real-time timer, interrupting hz times per second.
  */
 void
 hardclock(struct clockframe *frame)
 {
-   struct proc *p;
struct cpu_info *ci = curcpu();
-
-   p = curproc;
-   if (p && ((p->p_flag & (P_SYSTEM | P_WEXIT)) == 0)) {
-   struct process *pr = p->p_p;
-
-   /*
-* Run current process's virtual and profile time, as needed.
-*/
-   if (CLKF_USERMODE(frame) &&
-   timespecisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
-   itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick_nsec) == 0) {
-   atomic_setbits_int(&p->p_flag, P_ALRMPEND);
-   need_proftick(p);
-   }
-   if (timespecisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
-   itimerdecr(&pr->ps_timer[ITIMER_PROF], tick_nsec) == 0) {
-   atomic_setbits_int(&p->p_flag, P_PROFPEND);
-   need_proftick(p);
-   }
-   }
 
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
Index: kern/kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_time.c
--- kern/kern_time.c15 Feb 2023 10:07:50 -  1.163
+++ kern/kern_time.c2 Aug 2023 23:32:39 -
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -52,6 +53,7 @@
 #include 
 
 int itimerfix(struct itimerval *);
+void process_reset_itimer_flag(struct process *);
 
 /* 
  * Time of day and interval timer support.
@@ -551,6 +553,10 @@ setitimer(int which, const struct itimer
timeout_del(&pr->ps_realit_to);
}
*itimer = its;
+   if (which == ITIMER_VIRTUAL || which == ITIMER_PROF) {
+   process_reset_itimer_flag(pr);
+   need_resched(curcpu());
+   }
}
 
if (which == ITIMER_REAL)
@@ -729,47 +735,70 @@ itimerfix(struct itimerval *itv)
 }
 
 /*
- * Decrement an interval timer by the given number of nanoseconds.
+ * 

[v2]: uvm_meter, schedcpu: make uvm_meter() an independent timeout

2023-08-02 Thread Scott Cheloha
Now that the proc0 wakeup(9) is gone we can retry the other part of
the uvm_meter() patch.

uvm_meter() is meant to run every 5 seconds, but for historical
reasons it is called from schedcpu() and it is scheduled against the
UTC clock.  schedcpu() and uvm_meter() have different periods, so
uvm_meter() ought to be a separate timeout.  uvm_meter() is started
alongside schedcpu() so the two will still run in sync.

v1: https://marc.info/?l=openbsd-tech=168710929409153=2

ok?

Index: sys/uvm/uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.46
diff -u -p -r1.46 uvm_meter.c
--- sys/uvm/uvm_meter.c 2 Aug 2023 13:54:45 -   1.46
+++ sys/uvm/uvm_meter.c 2 Aug 2023 15:13:49 -
@@ -85,10 +85,12 @@ void uvmexp_read(struct uvmexp *);
  * uvm_meter: calculate load average
  */
 void
-uvm_meter(void)
+uvm_meter(void *unused)
 {
-   if ((gettime() % 5) == 0)
-   uvm_loadav();
+   static struct timeout to = TIMEOUT_INITIALIZER(uvm_meter, NULL);
+
+   timeout_add_sec(&to, 5);
+   uvm_loadav();
 }
 
 /*
Index: sys/uvm/uvm_extern.h
===
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.170
diff -u -p -r1.170 uvm_extern.h
--- sys/uvm/uvm_extern.h21 Jun 2023 21:16:21 -  1.170
+++ sys/uvm/uvm_extern.h2 Aug 2023 15:13:49 -
@@ -414,7 +414,7 @@ voiduvmspace_free(struct vmspace *);
 struct vmspace *uvmspace_share(struct process *);
 intuvm_share(vm_map_t, vaddr_t, vm_prot_t,
vm_map_t, vaddr_t, vsize_t);
-void   uvm_meter(void);
+void   uvm_meter(void *);
 intuvm_sysctl(int *, u_int, void *, size_t *, 
void *, size_t, struct proc *);
 struct vm_page *uvm_pagealloc(struct uvm_object *,
Index: sys/kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.78
diff -u -p -r1.78 sched_bsd.c
--- sys/kern/sched_bsd.c25 Jul 2023 18:16:19 -  1.78
+++ sys/kern/sched_bsd.c2 Aug 2023 15:13:50 -
@@ -235,7 +235,6 @@ schedcpu(void *arg)
}
SCHED_UNLOCK(s);
}
-   uvm_meter();
wakeup(&lbolt);
timeout_add_sec(to, 1);
 }
@@ -688,6 +687,7 @@ scheduler_start(void)
 
rrticks_init = hz / 10;
schedcpu(&schedcpu_to);
+   uvm_meter(NULL);
 
 #ifndef SMALL_KERNEL
if (perfpolicy == PERFPOL_AUTO)
Index: share/man/man9/uvm_init.9
===
RCS file: /cvs/src/share/man/man9/uvm_init.9,v
retrieving revision 1.7
diff -u -p -r1.7 uvm_init.9
--- share/man/man9/uvm_init.9   21 Jun 2023 21:16:21 -  1.7
+++ share/man/man9/uvm_init.9   2 Aug 2023 15:13:50 -
@@ -168,7 +168,7 @@ argument is ignored.
 .Ft void
 .Fn uvm_kernacc "caddr_t addr" "size_t len" "int rw"
 .Ft void
-.Fn uvm_meter
+.Fn uvm_meter "void *arg"
 .Ft int
 .Fn uvm_sysctl "int *name" "u_int namelen" "void *oldp" "size_t *oldlenp" 
"void *newp " "size_t newlen" "struct proc *p"
 .Ft int
@@ -212,7 +212,7 @@ access, in the kernel address space.
 .Pp
 The
 .Fn uvm_meter
-function calculates the load average and wakes up the swapper if necessary.
+timeout updates system load averages every five seconds.
 .Pp
 The
 .Fn uvm_sysctl



Re: uvm_meter: remove wakeup of proc0

2023-07-31 Thread Scott Cheloha
On Mon, Jul 31, 2023 at 10:04:44PM +0200, Claudio Jeker wrote:
> On Mon, Jul 31, 2023 at 09:49:30PM +0200, Claudio Jeker wrote:
> > On Mon, Jul 31, 2023 at 08:03:41PM +0300, Vitaliy Makkoveev wrote:
> > > This is the culprit:
> > > 
> > > schedule_timeout_uninterruptible(long timeout)
> > > {
> > > tsleep(curproc, PWAIT, "schtou", timeout);
> > > return 0;
> > > }
> > > 
> > 
> > Please give this a try. I think on initialization
> > intel_dp_wait_source_oui() is called before intel_dp->last_oui_write is
> > set and this results in a 10min timeout because our jiffies are set to
> > ULONG_MAX - (10 * 60 * HZ);
> 
> After talking with kettenis@ I think the following diff is better.
> Starting with 0 jiffies should fix this issue.
> Unless we want to do the linux madness and set it to
>   ((unsigned long)(unsigned int) (-300*HZ))
> 
> -- 
> :wq Claudio
> 
> Index: kern_clock.c
> ===
> RCS file: /cvs/src/sys/kern/kern_clock.c,v
> retrieving revision 1.109
> diff -u -p -r1.109 kern_clock.c
> --- kern_clock.c  25 Jul 2023 18:16:19 -  1.109
> +++ kern_clock.c  31 Jul 2023 20:01:57 -
> @@ -84,7 +84,7 @@ int profhz;
>  int  profprocs;
>  int  ticks = INT_MAX - (15 * 60 * HZ);
>  
> -volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
> +volatile unsigned long jiffies;
>  
>  /*
>   * Initialize clock frequencies and start both clocks running.
> 

I think this is backwards.

Why are we changing the initial value of jiffies (wide) to resolve a
problem with the initialization of one struct (narrow)?  Changing the
initial value of jiffies just masks the root cause.

Isn't the right thing here to initialize the last-write timestamp when
the struct is allocated?



Re: uvm_loadav: don't recompute schedstate_percpu.spc_nrun

2023-07-31 Thread Scott Cheloha
On Fri, Jul 28, 2023 at 07:36:41PM -0500, Scott Cheloha wrote:
> claudio@ notes that uvm_loadav() pointlessly walks the allproc list to
> recompute schedstate_percpu.spn_nrun for each CPU.
> 
> We can just use the value instead of recomputing it.

Whoops, off-by-one.  The current load averaging code includes the
running thread in the nrun count if it is *not* the idle thread.

Index: uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.44
diff -u -p -r1.44 uvm_meter.c
--- uvm_meter.c 21 Jun 2023 21:16:21 -  1.44
+++ uvm_meter.c 31 Jul 2023 15:20:37 -
@@ -102,43 +102,29 @@ uvm_loadav(struct loadavg *avg)
 {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
-   int i, nrun;
-   struct proc *p;
-   int nrun_cpu[MAXCPUS];
+   struct schedstate_percpu *spc;
+   u_int i, nrun = 0, nrun_cpu;
+   int s;
 
-   nrun = 0;
-   memset(nrun_cpu, 0, sizeof(nrun_cpu));
 
-   LIST_FOREACH(p, &allproc, p_list) {
-   switch (p->p_stat) {
-   case SSTOP:
-   case SSLEEP:
-   break;
-   case SRUN:
-   case SONPROC:
-   if (p == p->p_cpu->ci_schedstate.spc_idleproc)
-   continue;
-   /* FALLTHROUGH */
-   case SIDL:
-   nrun++;
-   if (p->p_cpu)
-   nrun_cpu[CPU_INFO_UNIT(p->p_cpu)]++;
-   }
+   SCHED_LOCK(s);
+   CPU_INFO_FOREACH(cii, ci) {
+   spc = &ci->ci_schedstate;
+   nrun_cpu = spc->spc_nrun;
+   if (ci->ci_curproc != spc->spc_idleproc)
+   nrun_cpu++;
+   if (nrun_cpu == 0)
+   continue;
+   spc->spc_ldavg = (cexp[0] * spc->spc_ldavg +
+   nrun_cpu * FSCALE *
+   (FSCALE - cexp[0])) >> FSHIFT;
+   nrun += nrun_cpu;
}
+   SCHED_UNLOCK(s);
 
for (i = 0; i < 3; i++) {
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
-   }
-
-   CPU_INFO_FOREACH(cii, ci) {
-   struct schedstate_percpu *spc = &ci->ci_schedstate;
-
-   if (nrun_cpu[CPU_INFO_UNIT(ci)] == 0)
-   continue;
-   spc->spc_ldavg = (cexp[0] * spc->spc_ldavg +
-   nrun_cpu[CPU_INFO_UNIT(ci)] * FSCALE *
-   (FSCALE - cexp[0])) >> FSHIFT;
}
 }
 



ualarm.3: cleanup, rewrites

2023-07-30 Thread Scott Cheloha
This patch drags ualarm.3 over to where alarm.3 is.  I think it reads
better and the wording is truer to what the function actually does.
In particular, ualarm(3) does not block or "wait": the alarm is
scheduled for asynchronous delivery by the operating system.

I think I may have tried to clean this up two years ago.  I don't
remember where that got sidetracked, but fwiw this was written from
scratch using alarm.3 as a guide.

Two things I'm iffy on:

- "high resolution" or "high-resolution"?

- The current manual mentions an upper bound (2147483647).  I'm not
  sure when, if ever, that was the real limit: useconds_t is unsigned,
  so an upper bound of INT32_MAX seems off.

  I am leaning toward just leaving the patch as-is instead of trying
  to guide the end-user through the minefield of matching bespoke
  "_t" types to real types and limits.

Tweaks?  ok?
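
For context, the one-shot usage the new wording is aimed at looks
roughly like this (an illustrative sketch, not destined for the page):

	#include <signal.h>
	#include <unistd.h>

	static volatile sig_atomic_t fired;

	static void
	handler(int signo)
	{
		fired = 1;
	}

	int
	main(void)
	{
		signal(SIGALRM, handler);
		ualarm(500000, 0);	/* one SIGALRM in ~500ms, no reload */
		while (!fired)
			pause();	/* returns once the signal is delivered */
		return 0;
	}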

Index: ualarm.3
===
RCS file: /cvs/src/lib/libc/gen/ualarm.3,v
retrieving revision 1.17
diff -u -p -r1.17 ualarm.3
--- ualarm.326 Jul 2019 12:08:18 -  1.17
+++ ualarm.331 Jul 2023 01:05:23 -
@@ -32,7 +32,7 @@
 .Os
 .Sh NAME
 .Nm ualarm
-.Nd schedule signal after specified time
+.Nd schedule high resolution SIGALRM delivery
 .Sh SYNOPSIS
 .In unistd.h
 .Ft useconds_t
@@ -45,31 +45,37 @@ This is a simplified interface to
 .Pp
 The
 .Fn ualarm
-function waits a count of
+function schedules the
+.Dv SIGALRM
+signal for delivery to the calling process after at least the given number of
 .Fa microseconds
-before asserting the terminating signal
-.Dv SIGALRM .
-System activity or time used in processing the call may cause a slight
-delay.
-.Pp
-If the
+have elapsed.
+If
 .Fa interval
-argument is non-zero, the
+is non-zero,
+the
 .Dv SIGALRM
-signal will be sent
-to the process every
+signal is scheduled for redelivery to the calling process every
 .Fa interval
-microseconds after the timer expires (e.g., after
+microseconds thereafter.
+.Pp
+If an alarm is already pending,
+an additional call to
+.Fn ualarm
+supersedes the prior call.
+.Pp
+If
 .Fa microseconds
-number of microseconds have passed).
+is zero,
+any pending alarm is cancelled and the value of
+.Fa interval
+is ignored.
 .Sh RETURN VALUES
-When the signal has successfully been caught,
+The
 .Fn ualarm
-returns the amount of time left on the clock.
-The maximum value for
-.Fa microseconds
-allowed
-is 2147483647.
+function returns the number of microseconds remaining until the next
+alarm is scheduled for delivery,
+or zero if no alarm is pending.
 .Sh SEE ALSO
 .Xr setitimer 2 ,
 .Xr sigaction 2 ,
@@ -86,5 +92,17 @@ function conforms to
 .Sh HISTORY
 The
 .Fn ualarm
-function appeared in
+function first appeared in
 .Bx 4.3 .
+.Sh CAVEATS
+The
+.Fn ualarm
+function is implemented with the per-process
+.Dv ITIMER_REAL
+timer described in
+.Xr setitimer 2 .
+Use of both
+.Fn ualarm
+and
+.Xr setitimer 2
+in the same program may yield confusing behavior.



uvm_loadav: don't recompute schedstate_percpu.spc_nrun

2023-07-28 Thread Scott Cheloha
claudio@ notes that uvm_loadav() pointlessly walks the allproc list to
recompute schedstate_percpu.spc_nrun for each CPU.

We can just use the value instead of recomputing it.

ok?

Index: uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.44
diff -u -p -r1.44 uvm_meter.c
--- uvm_meter.c 21 Jun 2023 21:16:21 -  1.44
+++ uvm_meter.c 29 Jul 2023 00:31:19 -
@@ -102,43 +102,29 @@ uvm_loadav(struct loadavg *avg)
 {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
-   int i, nrun;
-   struct proc *p;
-   int nrun_cpu[MAXCPUS];
+   struct schedstate_percpu *spc;
+   u_int i, nrun = 0, nrun_cpu;
+   int s;
 
-   nrun = 0;
-   memset(nrun_cpu, 0, sizeof(nrun_cpu));
 
-   LIST_FOREACH(p, &allproc, p_list) {
-   switch (p->p_stat) {
-   case SSTOP:
-   case SSLEEP:
-   break;
-   case SRUN:
-   case SONPROC:
-   if (p == p->p_cpu->ci_schedstate.spc_idleproc)
-   continue;
-   /* FALLTHROUGH */
-   case SIDL:
-   nrun++;
-   if (p->p_cpu)
-   nrun_cpu[CPU_INFO_UNIT(p->p_cpu)]++;
-   }
+   SCHED_LOCK(s);
+   CPU_INFO_FOREACH(cii, ci) {
+   spc = &ci->ci_schedstate;
+   nrun_cpu = spc->spc_nrun;
+   if (ci->ci_curproc == spc->spc_idleproc)
+   nrun_cpu++;
+   if (nrun_cpu == 0)
+   continue;
+   spc->spc_ldavg = (cexp[0] * spc->spc_ldavg +
+   nrun_cpu * FSCALE *
+   (FSCALE - cexp[0])) >> FSHIFT;
+   nrun += nrun_cpu;
}
+   SCHED_UNLOCK(s);
 
for (i = 0; i < 3; i++) {
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
-   }
-
-   CPU_INFO_FOREACH(cii, ci) {
-   struct schedstate_percpu *spc = >ci_schedstate;
-
-   if (nrun_cpu[CPU_INFO_UNIT(ci)] == 0)
-   continue;
-   spc->spc_ldavg = (cexp[0] * spc->spc_ldavg +
-   nrun_cpu[CPU_INFO_UNIT(ci)] * FSCALE *
-   (FSCALE - cexp[0])) >> FSHIFT;
}
 }
 



Re: hardclock: move setitimer(2) code into itimer_update()

2023-07-27 Thread Scott Cheloha
On Wed, Jul 26, 2023 at 11:16:19AM -0500, Scott Cheloha wrote:
> This is the next patch in the clock interrupt reorganization series.
> 
> Now that statclock() is cleaned up we can turn to hardclock().
> 
> [...]
> 
> This patch moves the setitimer(2) code out of hardclock().  The big
> idea is identical to what we did with profil(2)/profclock in the
> profclock/gmonclock patch.
> 
> - Move the setitimer(2) polling code from hardclock() to a new clock
>   interrupt routine, itimer_update(), in kern_time.c.  itimer_update()
>   is periodic and runs at the same frequency as the hardclock.
> 
> - Each schedstate_percpu has its own itimer_update() handle, spc_itimer,
>   initialized during sched_init_cpu().
> 
> - The itimer_update() on a given CPU is enabled/disabled in
>   mi_switch()/sched_exit() if the running thread's process has enabled
>   ITIMER_VIRTUAL/ITIMER_PROF.  A new scheduler flag, SPCF_ITIMER,
>   signifies whether itimer_update() was started and needs stopping.
> 
> - A new per-process flag, PS_ITIMER, signifies whether any virtual
>   interval timers are running.  The flag is updated from the helper
>   routine process_reset_itimer_flag().  We use it during mi_switch()
>   to decide whether to start itimer_update() without entering itimer_mtx.
> 
> - In setitimer(), call need_resched() when the process changes the
>   state of ITIMER_VIRTUAL/ITIMER_PROF to force itimer_update() on/off.
> 
> regress/sys/kern/itimer passes.

Updated patch:

- Rebase on kern_clockintr.c,v1.29 and kern_sched.c,v1.81

- Stagger spc_itimer in clockintr_cpu_init() alongside spc_profclock
  until I can figure out where else to do it

ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.109
diff -u -p -r1.109 kern_clock.c
--- kern/kern_clock.c   25 Jul 2023 18:16:19 -  1.109
+++ kern/kern_clock.c   28 Jul 2023 03:44:16 -
@@ -105,41 +105,12 @@ initclocks(void)
 }
 
 /*
- * hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
- * We don't want to send signals with psignal from hardclock because it makes
- * MULTIPROCESSOR locking very complicated. Instead, to use an idea from
- * FreeBSD, we set a flag on the thread and when it goes to return to
- * userspace it signals itself.
- */
-
-/*
  * The real-time timer, interrupting hz times per second.
  */
 void
 hardclock(struct clockframe *frame)
 {
-   struct proc *p;
struct cpu_info *ci = curcpu();
-
-   p = curproc;
-   if (p && ((p->p_flag & (P_SYSTEM | P_WEXIT)) == 0)) {
-   struct process *pr = p->p_p;
-
-   /*
-* Run current process's virtual and profile time, as needed.
-*/
-   if (CLKF_USERMODE(frame) &&
-   timespecisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
-   itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick_nsec) == 0) {
-   atomic_setbits_int(&p->p_flag, P_ALRMPEND);
-   need_proftick(p);
-   }
-   if (timespecisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
-   itimerdecr(&pr->ps_timer[ITIMER_PROF], tick_nsec) == 0) {
-   atomic_setbits_int(&p->p_flag, P_PROFPEND);
-   need_proftick(p);
-   }
-   }
 
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
Index: kern/kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_time.c
--- kern/kern_time.c15 Feb 2023 10:07:50 -  1.163
+++ kern/kern_time.c28 Jul 2023 03:44:16 -
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -52,6 +53,7 @@
 #include 
 
 int itimerfix(struct itimerval *);
+void process_reset_itimer_flag(struct process *);
 
 /* 
  * Time of day and interval timer support.
@@ -551,6 +553,10 @@ setitimer(int which, const struct itimer
timeout_del(&pr->ps_realit_to);
}
*itimer = its;
+   if (which == ITIMER_VIRTUAL || which == ITIMER_PROF) {
+   process_reset_itimer_flag(pr);
+   need_resched(curcpu());
+   }
}
 
if (which == ITIMER_REAL)
@@ -729,47 +735,70 @@ itimerfix(struct itimerval *itv)
 }
 
 /*
- * Decrement an interval timer by the given number of nanoseconds.
+ * Decrement an interval timer by the given duration.
  * If the timer expires and it is periodic then reload it.  When reloading
  * the timer we subtract any overrun from the next period so that the timer
  * does not drift.
  */
 int
-itimerdecr(struct itime

hardclock: move setitimer(2) code into itimer_update()

2023-07-26 Thread Scott Cheloha
This is the next patch in the clock interrupt reorganization series.

Now that statclock() is cleaned up we can turn to hardclock().

The goal of the next four patches is to eliminate the need for the
hardclock on secondary CPUs.  Secondary CPUs don't need a hardclock.
hardclock() is only used on secondary CPUs to poll for events that
rarely happen: setitimer(2) hits, dt(4), and roundrobin().  We can
break all of these out into separate clock interrupt routines.

This patch moves the setitimer(2) code out of hardclock().  The big
idea is identical to what we did with profil(2)/profclock in the
profclock/gmonclock patch.

- Move the setitimer(2) polling code from hardclock() to a new clock
  interrupt routine, itimer_update(), in kern_time.c.  itimer_update()
  is periodic and runs at the same frequency as the hardclock.

- Each schedstate_percpu has its own itimer_update() handle, spc_itimer,
  initialized during sched_init_cpu().

- The itimer_update() on a given CPU is enabled/disabled in
  mi_switch()/sched_exit() if the running thread's process has enabled
  ITIMER_VIRTUAL/ITIMER_PROF.  A new scheduler flag, SPCF_ITIMER,
  signifies whether itimer_update() was started and needs stopping.

- A new per-process flag, PS_ITIMER, signifies whether any virtual
  interval timers are running.  The flag is updated from the helper
  routine process_reset_itimer_flag().  We use it during mi_switch()
  to decide whether to start itimer_update() without entering itimer_mtx.

- In setitimer(), call need_resched() when the process changes the
  state of ITIMER_VIRTUAL/ITIMER_PROF to force itimer_update() on/off.

regress/sys/kern/itimer passes.
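
For anyone who wants to poke at the mi_switch() enable/disable path
from userland, a trivial ITIMER_VIRTUAL consumer along these lines
will exercise it (a sketch, not part of the regress suite):

	#include <sys/time.h>
	#include <signal.h>
	#include <stdio.h>

	static volatile sig_atomic_t hits;

	static void
	vtalrm(int signo)
	{
		hits++;
	}

	int
	main(void)
	{
		struct itimerval it = {
			{ 0, 100000 },		/* it_interval: 100ms */
			{ 0, 100000 }		/* it_value: 100ms */
		};

		signal(SIGVTALRM, vtalrm);
		setitimer(ITIMER_VIRTUAL, &it, NULL);
		while (hits < 10)
			continue;	/* burn user CPU so the virtual timer runs */
		printf("got %d SIGVTALRM deliveries\n", (int)hits);
		return 0;
	}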

ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.109
diff -u -p -r1.109 kern_clock.c
--- kern/kern_clock.c   25 Jul 2023 18:16:19 -  1.109
+++ kern/kern_clock.c   26 Jul 2023 14:41:02 -
@@ -86,6 +86,8 @@ int   ticks = INT_MAX - (15 * 60 * HZ);
 
 volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
 
+uint32_t hardclock_period; /* [I] hardclock period (ns) */
+
 /*
  * Initialize clock frequencies and start both clocks running.
  */
@@ -97,6 +99,9 @@ initclocks(void)
 */
cpu_initclocks();
 
+   KASSERT(hz > 0 && hz <= 10);
+   hardclock_period = 10 / hz;
+
KASSERT(profhz >= stathz && profhz <= 10);
KASSERT(profhz % stathz == 0);
profclock_period = 10 / profhz;
@@ -105,41 +110,12 @@ initclocks(void)
 }
 
 /*
- * hardclock does the accounting needed for ITIMER_PROF and ITIMER_VIRTUAL.
- * We don't want to send signals with psignal from hardclock because it makes
- * MULTIPROCESSOR locking very complicated. Instead, to use an idea from
- * FreeBSD, we set a flag on the thread and when it goes to return to
- * userspace it signals itself.
- */
-
-/*
  * The real-time timer, interrupting hz times per second.
  */
 void
 hardclock(struct clockframe *frame)
 {
-   struct proc *p;
struct cpu_info *ci = curcpu();
-
-   p = curproc;
-   if (p && ((p->p_flag & (P_SYSTEM | P_WEXIT)) == 0)) {
-   struct process *pr = p->p_p;
-
-   /*
-* Run current process's virtual and profile time, as needed.
-*/
-   if (CLKF_USERMODE(frame) &&
-   timespecisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
-   itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick_nsec) == 0) {
-   atomic_setbits_int(&p->p_flag, P_ALRMPEND);
-   need_proftick(p);
-   }
-   if (timespecisset(&pr->ps_timer[ITIMER_PROF].it_value) &&
-   itimerdecr(&pr->ps_timer[ITIMER_PROF], tick_nsec) == 0) {
-   atomic_setbits_int(&p->p_flag, P_PROFPEND);
-   need_proftick(p);
-   }
-   }
 
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
Index: kern/kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_time.c
--- kern/kern_time.c15 Feb 2023 10:07:50 -  1.163
+++ kern/kern_time.c26 Jul 2023 14:41:02 -
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -52,6 +53,7 @@
 #include 
 
 int itimerfix(struct itimerval *);
+void process_reset_itimer_flag(struct process *);
 
 /* 
  * Time of day and interval timer support.
@@ -551,6 +553,10 @@ setitimer(int which, const struct itimer
timeout_del(&pr->ps_realit_to);
}
*itimer = its;
+   if (which != ITIMER_REAL) {
+   process_reset_itimer_flag(pr);
+   need_resched(curcpu());
+   }
}
 
if (which == 

Re: option GPROF on riscv64

2023-07-22 Thread Scott Cheloha
On Fri, Jul 21, 2023 at 08:41:32PM +0200, Jeremie Courreges-Anglas wrote:
> 
> Spotted while testing a diff from cheloha@, option GPROF doesn't build
> on riscv64 because MCOUNT_ENTER/MCOUNT_EXIT from
> riscv64/include/profile.h haven't been adapted for riscv64.
> 
> riscv64 /sys/arch/riscv64/compile/GPROF.MP$ doas -u build make
> cc -g -Werror -Wall -Wimplicit-function-declaration  -Wno-pointer-sign  
> -Wno-constant-conversion -Wno-address-of-packed-member  
> -Wno-unused-but-set-variable -Wno-gnu-folding-constant  
> -Wframe-larger-than=2047 -Wno-deprecated-non-prototype 
> -Wno-unknown-warning-option -march=rv64gc -mcmodel=medany -mno-relax  
> -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -ffreestanding -fno-pie 
> -O2  -pipe -nostdinc -I/sys -I/sys/arch/riscv64/compile/GPROF.MP/obj 
> -I/sys/arch  -I/sys/dev/pci/drm/include  -I/sys/dev/pci/drm/include/uapi 
> -DDDB -DDIAGNOSTIC -DKTRACE -DACCOUNTING -DKMEMSTATS -DPTRACE -DPOOL_DEBUG 
> -DCRYPTO -DSYSVMSG -DSYSVSEM -DSYSVSHM -DUVM_SWAP_ENCRYPT -DFFS -DFFS2 
> -DFFS_SOFTUPDATES -DUFS_DIRHASH -DQUOTA -DEXT2FS -DMFS -DNFSCLIENT 
> -DNFSSERVER -DCD9660 -DUDF -DMSDOSFS -DFIFO -DFUSE -DSOCKET_SPLICE -DTCP_ECN 
> -DTCP_SIGNATURE -DINET6 -DIPSEC -DPPP_BSDCOMP -DPPP_DEFLATE -DPIPEX 
> -DMROUTING -DMPLS -DBOOT_CONFIG -DPCIVERBOSE -DUSER_PCICONF 
> -DWSDISPLAY_COMPAT_USL -DWSDISPLAY_COMPAT_RAWKBD 
> -DWSDISPLAY_DEFAULTSCREENS="6" -DMULTIPROCESSOR -DGPROF -DMAXUSERS=80 
> -D_KERNEL -D__riscv64__ -MD -MP -pg -c /sys/kern/subr_prof.c
> cc -g -Werror -Wall -Wimplicit-function-declaration  -Wno-pointer-sign  
> -Wno-constant-conversion -Wno-address-of-packed-member  
> -Wno-unused-but-set-variable -Wno-gnu-folding-constant  
> -Wframe-larger-than=2047 -Wno-deprecated-non-prototype 
> -Wno-unknown-warning-option -march=rv64gc -mcmodel=medany -mno-relax  
> -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -ffreestanding -fno-pie 
> -O2  -pipe -nostdinc -I/sys -I/sys/arch/riscv64/compile/GPROF.MP/obj 
> -I/sys/arch  -I/sys/dev/pci/drm/include  -I/sys/dev/pci/drm/include/uapi 
> -DDDB -DDIAGNOSTIC -DKTRACE -DACCOUNTING -DKMEMSTATS -DPTRACE -DPOOL_DEBUG 
> -DCRYPTO -DSYSVMSG -DSYSVSEM -DSYSVSHM -DUVM_SWAP_ENCRYPT -DFFS -DFFS2 
> -DFFS_SOFTUPDATES -DUFS_DIRHASH -DQUOTA -DEXT2FS -DMFS -DNFSCLIENT 
> -DNFSSERVER -DCD9660 -DUDF -DMSDOSFS -DFIFO -DFUSE -DSOCKET_SPLICE -DTCP_ECN 
> -DTCP_SIGNATURE -DINET6 -DIPSEC -DPPP_BSDCOMP -DPPP_DEFLATE -DPIPEX 
> -DMROUTING -DMPLS -DBOOT_CONFIG -DPCIVERBOSE -DUSER_PCICONF 
> -DWSDISPLAY_COMPAT_USL -DWSDISPLAY_COMPAT_RAWKBD 
> -DWSDISPLAY_DEFAULTSCREENS="6" -DMULTIPROCESSOR -DGPROF -DMAXUSERS=80 
> -D_KERNEL -D__riscv64__ -MD -MP -fno-ret-protector -c 
> /sys/lib/libkern/mcount.c
> /sys/lib/libkern/mcount.c:79:2: error: invalid operand in inline asm: 'mrs 
> ${0:x},daif; msr daifset, #0x2'
> MCOUNT_ENTER;
> ^
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:88:10: note: 
> expanded from macro 'MCOUNT_ENTER'
> __asm__ ("mrs %x0,daif; msr daifset, #0x2": "=r"(s));
>  ^
> /sys/lib/libkern/mcount.c:79:2: error: unknown operand
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:88:10: note: 
> expanded from macro 'MCOUNT_ENTER'
> __asm__ ("mrs %x0,daif; msr daifset, #0x2": "=r"(s));
>  ^
> :1:6: note: instantiated into assembly here
> mrs ,daif; msr daifset, #0x2
> ^
> /sys/lib/libkern/mcount.c:79:2: error: unknown operand
> MCOUNT_ENTER;
> ^
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:88:10: note: 
> expanded from macro 'MCOUNT_ENTER'
> __asm__ ("mrs %x0,daif; msr daifset, #0x2": "=r"(s));
>  ^
> :1:26: note: instantiated into assembly here
> mrs ,daif; msr daifset, #0x2
> ^
> /sys/lib/libkern/mcount.c:171:2: error: invalid operand in inline asm: 'msr 
> daif, ${0:x}'
> MCOUNT_EXIT;
> ^
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:90:10: note: 
> expanded from macro 'MCOUNT_EXIT'
> __asm__ ("msr daif, %x0":: "r"(s));
>  ^
> /sys/lib/libkern/mcount.c:171:2: error: unknown operand
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:90:10: note: 
> expanded from macro 'MCOUNT_EXIT'
> __asm__ ("msr daif, %x0":: "r"(s));
>  ^
> :1:12: note: instantiated into assembly here
> msr daif,
>   ^
> /sys/lib/libkern/mcount.c:171:2: error: invalid operand in inline asm: 'msr 
> daif, ${0:x}'
> MCOUNT_EXIT;
> ^
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:90:10: note: 
> expanded from macro 'MCOUNT_EXIT'
> __asm__ ("msr daif, %x0":: "r"(s));
>  ^
> /sys/lib/libkern/mcount.c:171:2: error: unknown operand
> /sys/arch/riscv64/compile/GPROF.MP/obj/machine/profile.h:90:10: note: 
> expanded from macro 'MCOUNT_EXIT'
> __asm__ ("msr daif, %x0":: "r"(s));
>  ^
> :1:12: note: instantiated into assembly here
> msr daif,
>   ^
> /sys/lib/libkern/mcount.c:171:2: error: 

Re: [v2] statclock: move profil(2), GPROF code into other clock interrupts

2023-07-21 Thread Scott Cheloha
On Fri, Jul 21, 2023 at 08:37:11PM +0200, Jeremie Courreges-Anglas wrote:
> On Fri, Jul 21 2023, Mike Larkin  wrote:
> > On Fri, Jul 21, 2023 at 05:46:32PM +0200, Jeremie Courreges-Anglas wrote:
> >> On Thu, Jul 20 2023, Scott Cheloha  wrote:
> >> > On Wed, Jul 19, 2023 at 05:09:04AM +, Mike Larkin wrote:
> >> >> On Tue, Jul 18, 2023 at 08:21:41AM -0500, Scott Cheloha wrote:
> >> >> > This patch moves the profil(2)- and GPROF-specific parts of
> >> >> > statclock() out into into separate clock interrupt routines.  The
> >> >> > profil(2) part moves into profclock() and is enabled/disabled as
> >> >> > needed during mi_switch().  The GPROF part moves into gmonclock() and
> >> >> > is enabled/disabled as needed via sysctl(2).
> >> >> >
> >> >> > Moving those parts out of statclock() eliminates the need for an
> >> >> > effective statclock frequency and we can delete all the junk related
> >> >> > to that: psratio/psdiv/pscnt and corresponding members of
> >> >> > schedstate_percpu, clockintr_setstatclockrate(), a bunch of other
> >> >> > clockintr-internal code
> >> >> >
> >> >> > In separate commits I have addressed:
> >> >> >
> >> >> > - General GPROF instability on amd64
> >> >> > - GPROF causing a crash during suspend/resume
> >> >> > - CTASSERT breakage on amd64 related to schedstate_percpu
> >> >> >   changes in this patch
> >> >> >
> >> >> > This has been kicking around for over two months.  Personally, I have
> >> >> > tested it on amd64, arm64, macppc, octeon, and sparc64.
> >> >> >
> >> >> > Compile- and boot-tests on other platforms (alpha, i386, luna88k,
> >> >> > riscv64, sh) would be appreciated, but the last time I asked for tests
> >> >> > I got zero reports back.
> >> >>
> >> >> i386 compiles and boots.
> >> >
> >> > Great!
> >> >
> >> >> as reported in separate mail, riscv64 doesn't compile.
> >> >
> >> > I think we're missing a 'struct user' definition on riscv64.  Can you
> >> > try this?
> >>
> >> GENERIC.MP with option GPROF doesn't build on riscv64, but this diff
> >> doesn't introduce any new error.  Runtime untested.
> >>
> >> --
> >> jca | PGP : 0x1524E7EE / 5135 92C1 AD36 5293 2BDF  DDCC 0DFA 74AE 1524 E7EE
> >>
> >
> > Yes, I should have pointed out I did a normal build and not a GPROF build
> > which I have no idea how to test, nor do I use. Same disclaimer applies to
> > i386.
> 
> To test-build this, I merely added:
>   makeoptionsPROF="-pg"
>   option GPROF
> to a GENERIC.MP copy and then booted that kernel.
> 
> Scott, your statclock diff doesn't seem to make the runtime behavior any
> worse on riscv64 (ie it's broken, but maybe my profile.h fix isn't right).

Drat.

As long as the default (no GPROF, no PROF=-pg) kernel compiles, boots,
and is self-hosting, we're fine.

GPROF is a developer-only edge case that I am trying to spruce up.
The code is in the tree, so the code may as well work.  More than
anything, though, GPROF is *in the way* of later, more interesting
clock interrupt patches and I'm trying to get it out of the way and
keep moving forward.

If GPROF is already broken on riscv64 then this patch isn't to blame.
If kgmon(8) did work on riscv64 at some point in the past, it will be
interesting to work out how it got broken.

Uh, can you check whether profil(2) works?  You will need a
pledge(2)-free program to test with.
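
Something along the lines of the sketch below should do.  The buffer
size, offset and scale choices are just illustrative and the
offset/scale handling is cribbed from the profil(2) man page, so
double-check it before trusting the numbers:

	#include <sys/types.h>
	#include <err.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int
	main(void)
	{
		size_t size = 1 << 20;		/* 1MB sample buffer */
		unsigned short *buf;
		unsigned long total = 0;
		volatile unsigned long x = 0;
		size_t i;

		if ((buf = calloc(1, size)) == NULL)
			err(1, "calloc");

		/* 0x10000 is a 1:1 scale; start sampling at main(). */
		if (profil((char *)buf, size, (u_long)main, 0x10000) == -1)
			err(1, "profil");

		for (i = 0; i < 100000000; i++)	/* burn CPU inside main() */
			x += i;

		profil(NULL, 0, 0, 0);		/* a scale of 0 turns sampling off */

		for (i = 0; i < size / sizeof(*buf); i++)
			total += buf[i];
		printf("%lu samples recorded\n", total);
		return 0;
	}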



Re: [v2] statclock: move profil(2), GPROF code into other clock interrupts

2023-07-20 Thread Scott Cheloha
On Thu, Jul 20, 2023 at 07:21:06PM +, Mike Larkin wrote:
> On Thu, Jul 20, 2023 at 01:23:01PM -0500, Scott Cheloha wrote:
> > On Wed, Jul 19, 2023 at 05:09:04AM +, Mike Larkin wrote:
> > > On Tue, Jul 18, 2023 at 08:21:41AM -0500, Scott Cheloha wrote:
> > > > This patch moves the profil(2)- and GPROF-specific parts of
> > > > statclock() out into into separate clock interrupt routines.  The
> > > > profil(2) part moves into profclock() and is enabled/disabled as
> > > > needed during mi_switch().  The GPROF part moves into gmonclock() and
> > > > is enabled/disabled as needed via sysctl(2).
> > > >
> > > > Moving those parts out of statclock() eliminates the need for an
> > > > effective statclock frequency and we can delete all the junk related
> > > > to that: psratio/psdiv/pscnt and corresponding members of
> > > > schedstate_percpu, clockintr_setstatclockrate(), a bunch of other
> > > > clockintr-internal code
> > > >
> > > > In separate commits I have addressed:
> > > >
> > > > - General GPROF instability on amd64
> > > > - GPROF causing a crash during suspend/resume
> > > > - CTASSERT breakage on amd64 related to schedstate_percpu
> > > >   changes in this patch
> > > >
> > > > This has been kicking around for over two months.  Personally, I have
> > > > tested it on amd64, arm64, macppc, octeon, and sparc64.
> > > >
> > > > Compile- and boot-tests on other platforms (alpha, i386, luna88k,
> > > > riscv64, sh) would be appreciated, but the last time I asked for tests
> > > > I got zero reports back.
> > >
> > > i386 compiles and boots.
> >
> > Great!
> >
> > > as reported in separate mail, riscv64 doesn't compile.
> >
> > I think we're missing a 'struct user' definition on riscv64.  Can you
> > try this?
> >
> 
> compiles and boots ok.

Good, thanks!

With i386 and riscv64 covered, I'm feeling more confident.  I'm now
looking for OKs for the attached patch.

ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.108
diff -u -p -r1.108 kern_clock.c
--- kern/kern_clock.c   25 Apr 2023 00:58:47 -  1.108
+++ kern/kern_clock.c   19 Jul 2023 14:33:04 -
@@ -49,10 +49,6 @@
 #include 
 #include 
 
-#if defined(GPROF) || defined(DDBPROF)
-#include 
-#endif
-
 #include "dt.h"
 #if NDT > 0
 #include 
@@ -87,8 +83,6 @@ int   schedhz;
 intprofhz;
 intprofprocs;
 intticks = INT_MAX - (15 * 60 * HZ);
-static int psdiv, pscnt;   /* prof => stat divider */
-intpsratio;/* ratio: prof / stat */
 
 volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
 
@@ -99,16 +93,13 @@ void
 initclocks(void)
 {
/*
-* Set divisors to 1 (normal case) and let the machine-specific
-* code do its bit.
+* Let the machine-specific code do its bit.
 */
-   psdiv = pscnt = 1;
cpu_initclocks();
 
-   /*
-* Compute profhz/stathz.
-*/
-   psratio = profhz / stathz;
+   KASSERT(profhz >= stathz && profhz <= 10);
+   KASSERT(profhz % stathz == 0);
+   profclock_period = 10 / profhz;
 
inittimecounter();
 }
@@ -256,7 +247,6 @@ startprofclock(struct process *pr)
atomic_setbits_int(&pr->ps_flags, PS_PROFIL);
if (++profprocs == 1) {
s = splstatclock();
-   psdiv = pscnt = psratio;
setstatclockrate(profhz);
splx(s);
}
@@ -275,7 +265,6 @@ stopprofclock(struct process *pr)
atomic_clearbits_int(&pr->ps_flags, PS_PROFIL);
if (--profprocs == 0) {
s = splstatclock();
-   psdiv = pscnt = 1;
setstatclockrate(stathz);
splx(s);
}
@@ -289,35 +278,13 @@ stopprofclock(struct process *pr)
 void
 statclock(struct clockframe *frame)
 {
-#if defined(GPROF) || defined(DDBPROF)
-   struct gmonparam *g;
-   u_long i;
-#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = >ci_schedstate;
struct proc *p = curproc;
struct process *pr;
 
-   /*
-* Notice changes in divisor frequency, and adjust clock
-* frequency accordingly.
-*/
-   if (spc->spc_psdiv != psdiv) {
-   spc->spc_psdiv = psdiv;
-   spc->spc_pscnt = psdiv;
-   if (psdiv == 1) {
-  

Re: [v2] statclock: move profil(2), GPROF code into other clock interrupts

2023-07-20 Thread Scott Cheloha
On Wed, Jul 19, 2023 at 05:09:04AM +, Mike Larkin wrote:
> On Tue, Jul 18, 2023 at 08:21:41AM -0500, Scott Cheloha wrote:
> > This patch moves the profil(2)- and GPROF-specific parts of
> > statclock() out into into separate clock interrupt routines.  The
> > profil(2) part moves into profclock() and is enabled/disabled as
> > needed during mi_switch().  The GPROF part moves into gmonclock() and
> > is enabled/disabled as needed via sysctl(2).
> >
> > Moving those parts out of statclock() eliminates the need for an
> > effective statclock frequency and we can delete all the junk related
> > to that: psratio/psdiv/pscnt and corresponding members of
> > schedstate_percpu, clockintr_setstatclockrate(), a bunch of other
> > clockintr-internal code
> >
> > In separate commits I have addressed:
> >
> > - General GPROF instability on amd64
> > - GPROF causing a crash during suspend/resume
> > - CTASSERT breakage on amd64 related to schedstate_percpu
> >   changes in this patch
> >
> > This has been kicking around for over two months.  Personally, I have
> > tested it on amd64, arm64, macppc, octeon, and sparc64.
> >
> > Compile- and boot-tests on other platforms (alpha, i386, luna88k,
> > riscv64, sh) would be appreciated, but the last time I asked for tests
> > I got zero reports back.
> 
> i386 compiles and boots.

Great!

> as reported in separate mail, riscv64 doesn't compile.

I think we're missing a 'struct user' definition on riscv64.  Can you
try this?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.108
diff -u -p -r1.108 kern_clock.c
--- kern/kern_clock.c   25 Apr 2023 00:58:47 -  1.108
+++ kern/kern_clock.c   19 Jul 2023 14:33:04 -
@@ -49,10 +49,6 @@
 #include 
 #include 
 
-#if defined(GPROF) || defined(DDBPROF)
-#include 
-#endif
-
 #include "dt.h"
 #if NDT > 0
 #include 
@@ -87,8 +83,6 @@ int   schedhz;
 intprofhz;
 intprofprocs;
 intticks = INT_MAX - (15 * 60 * HZ);
-static int psdiv, pscnt;   /* prof => stat divider */
-intpsratio;/* ratio: prof / stat */
 
 volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
 
@@ -99,16 +93,13 @@ void
 initclocks(void)
 {
/*
-* Set divisors to 1 (normal case) and let the machine-specific
-* code do its bit.
+* Let the machine-specific code do its bit.
 */
-   psdiv = pscnt = 1;
cpu_initclocks();
 
-   /*
-* Compute profhz/stathz.
-*/
-   psratio = profhz / stathz;
+   KASSERT(profhz >= stathz && profhz <= 10);
+   KASSERT(profhz % stathz == 0);
+   profclock_period = 10 / profhz;
 
inittimecounter();
 }
@@ -256,7 +247,6 @@ startprofclock(struct process *pr)
atomic_setbits_int(&pr->ps_flags, PS_PROFIL);
if (++profprocs == 1) {
s = splstatclock();
-   psdiv = pscnt = psratio;
setstatclockrate(profhz);
splx(s);
}
@@ -275,7 +265,6 @@ stopprofclock(struct process *pr)
atomic_clearbits_int(&pr->ps_flags, PS_PROFIL);
if (--profprocs == 0) {
s = splstatclock();
-   psdiv = pscnt = 1;
setstatclockrate(stathz);
splx(s);
}
@@ -289,35 +278,13 @@ stopprofclock(struct process *pr)
 void
 statclock(struct clockframe *frame)
 {
-#if defined(GPROF) || defined(DDBPROF)
-   struct gmonparam *g;
-   u_long i;
-#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p = curproc;
struct process *pr;
 
-   /*
-* Notice changes in divisor frequency, and adjust clock
-* frequency accordingly.
-*/
-   if (spc->spc_psdiv != psdiv) {
-   spc->spc_psdiv = psdiv;
-   spc->spc_pscnt = psdiv;
-   if (psdiv == 1) {
-   setstatclockrate(stathz);
-   } else {
-   setstatclockrate(profhz);
-   }
-   }
-
if (CLKF_USERMODE(frame)) {
pr = p->p_p;
-   if (pr->ps_flags & PS_PROFIL)
-   addupc_intr(p, CLKF_PC(frame), 1);
-   if (--spc->spc_pscnt > 0)
-   return;
/*
 * Came from user mode; CPU was in user state.
 * If this process is being profiled record the tick.
@@ -328,23 +295,6 @@ statclock(struct clockframe *frame)
else
spc->

[v2] statclock: move profil(2), GPROF code into other clock interrupts

2023-07-18 Thread Scott Cheloha
This patch moves the profil(2)- and GPROF-specific parts of
statclock() out into separate clock interrupt routines.  The
profil(2) part moves into profclock() and is enabled/disabled as
needed during mi_switch().  The GPROF part moves into gmonclock() and
is enabled/disabled as needed via sysctl(2).

Moving those parts out of statclock() eliminates the need for an
effective statclock frequency and we can delete all the junk related
to that: psratio/psdiv/pscnt and corresponding members of
schedstate_percpu, clockintr_setstatclockrate(), a bunch of other
clockintr-internal code.

In separate commits I have addressed:

- General GPROF instability on amd64
- GPROF causing a crash during suspend/resume
- CTASSERT breakage on amd64 related to schedstate_percpu
  changes in this patch

This has been kicking around for over two months.  Personally, I have
tested it on amd64, arm64, macppc, octeon, and sparc64.

Compile- and boot-tests on other platforms (alpha, i386, luna88k,
riscv64, sh) would be appreciated, but the last time I asked for tests
I got zero reports back.

I don't know how to proceed.

FWIW, GPROF is not enabled in any default kernel configurations and
profil(2) is more-or-less useless (and painful to test) until I finish
changing the libc gmon code and gprof(1).  So, the patch is low-risk.

v1: https://marc.info/?l=openbsd-tech=168721453821801=2

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.108
diff -u -p -r1.108 kern_clock.c
--- kern/kern_clock.c   25 Apr 2023 00:58:47 -  1.108
+++ kern/kern_clock.c   18 Jul 2023 13:14:27 -
@@ -49,10 +49,6 @@
 #include 
 #include 
 
-#if defined(GPROF) || defined(DDBPROF)
-#include 
-#endif
-
 #include "dt.h"
 #if NDT > 0
 #include 
@@ -87,8 +83,6 @@ int   schedhz;
 intprofhz;
 intprofprocs;
 intticks = INT_MAX - (15 * 60 * HZ);
-static int psdiv, pscnt;   /* prof => stat divider */
-intpsratio;/* ratio: prof / stat */
 
 volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
 
@@ -99,16 +93,13 @@ void
 initclocks(void)
 {
/*
-* Set divisors to 1 (normal case) and let the machine-specific
-* code do its bit.
+* Let the machine-specific code do its bit.
 */
-   psdiv = pscnt = 1;
cpu_initclocks();
 
-   /*
-* Compute profhz/stathz.
-*/
-   psratio = profhz / stathz;
+   KASSERT(profhz >= stathz && profhz <= 10);
+   KASSERT(profhz % stathz == 0);
+   profclock_period = 10 / profhz;
 
inittimecounter();
 }
@@ -256,7 +247,6 @@ startprofclock(struct process *pr)
atomic_setbits_int(&pr->ps_flags, PS_PROFIL);
if (++profprocs == 1) {
s = splstatclock();
-   psdiv = pscnt = psratio;
setstatclockrate(profhz);
splx(s);
}
@@ -275,7 +265,6 @@ stopprofclock(struct process *pr)
atomic_clearbits_int(&pr->ps_flags, PS_PROFIL);
if (--profprocs == 0) {
s = splstatclock();
-   psdiv = pscnt = 1;
setstatclockrate(stathz);
splx(s);
}
@@ -289,35 +278,13 @@ stopprofclock(struct process *pr)
 void
 statclock(struct clockframe *frame)
 {
-#if defined(GPROF) || defined(DDBPROF)
-   struct gmonparam *g;
-   u_long i;
-#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p = curproc;
struct process *pr;
 
-   /*
-* Notice changes in divisor frequency, and adjust clock
-* frequency accordingly.
-*/
-   if (spc->spc_psdiv != psdiv) {
-   spc->spc_psdiv = psdiv;
-   spc->spc_pscnt = psdiv;
-   if (psdiv == 1) {
-   setstatclockrate(stathz);
-   } else {
-   setstatclockrate(profhz);
-   }
-   }
-
if (CLKF_USERMODE(frame)) {
pr = p->p_p;
-   if (pr->ps_flags & PS_PROFIL)
-   addupc_intr(p, CLKF_PC(frame), 1);
-   if (--spc->spc_pscnt > 0)
-   return;
/*
 * Came from user mode; CPU was in user state.
 * If this process is being profiled record the tick.
@@ -328,23 +295,6 @@ statclock(struct clockframe *frame)
else
spc->spc_cp_time[CP_USER]++;
} else {
-#if defined(GPROF) || defined(DDBPROF)
-   /*
-* Kernel statistics are just like addupc_intr, only easier.
-*/
-   g = ci->ci_gmon;
-   if (g != NULL && g->state == GMON_PROF_ON) {
-   i = 

Re: GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-11 Thread Scott Cheloha
On Mon, Jul 10, 2023 at 10:41:15AM -0500, Scott Cheloha wrote:
> On Mon, Jul 10, 2023 at 05:19:35PM +0200, Mark Kettenis wrote:
> > > Date: Mon, 10 Jul 2023 09:57:39 -0500
> > > From: Scott Cheloha 
> > > 
> > > On Mon, Jul 10, 2023 at 07:42:55AM -0600, Theo de Raadt wrote:
> > > > I dare you to write the simplest fix for this, instead of a diff that
> > > > scrolls by.
> > > 
> > > This patch seems to work.  Will need to bang on it for a few more days.
> > > 
> > > 1. Disable gmoninit after sched_stop_secondary_cpus().  The secondary
> > >CPUs have halted, so we aren't racing sysctl(2) on a secondary CPU.
> > > 
> > > 2. Restore gmoninit between resume_mp() and sched_start_secondary_cpus().
> > >The secondary CPUs are out of cpu_hatch(), which is probably where we
> > >are crashing during resume.  The secondary CPUs haven't started
> > >scheduling yet, so we aren't racing sysctl(2).
> > 
> > It is still a bit scary to have cpu_hatch() call _mcount() but I guess
> > adding __attribute__((no_profile)) to all of the functions called by
> > cpu_hatch() isn't really workable either.
> > 
> > That said...
> > 
> > > Index: subr_suspend.c
> > > ===
> > > RCS file: /cvs/src/sys/kern/subr_suspend.c,v
> > > retrieving revision 1.15
> > > diff -u -p -r1.15 subr_suspend.c
> > > --- subr_suspend.c2 Jul 2023 19:02:27 -   1.15
> > > +++ subr_suspend.c10 Jul 2023 14:51:01 -
> > > @@ -18,6 +18,7 @@
> > >  
> > >  #include 
> > >  #include 
> > > +#include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > @@ -100,6 +101,14 @@ top:
> > >  #ifdef MULTIPROCESSOR
> > >   sched_stop_secondary_cpus();
> > >   KASSERT(CPU_IS_PRIMARY(curcpu()));
> > > +#endif
> > > +#ifdef GPROF
> > > + extern int gmoninit;
> > > + int gmon_state = gmoninit;
> > 
> > No variable declarations in the middle of functions please.
> 
> Yep, moved up.
> 
> > > + gmoninit = 0;
> > > + membar_producer();
> > 
> > Why are you messing with memory barriers here?
> > 
> > > +#endif
> > > +#ifdef MULTIPROCESSOR
> > >   sleep_mp();
> > >  #endif
> > >  
> > > @@ -172,6 +181,12 @@ fail_suspend:
> > >   resume_randomness(rndbuf, rndbuflen);
> > >  #ifdef MULTIPROCESSOR
> > >   resume_mp();
> > > +#endif
> > > +#ifdef GPROF
> > > + gmoninit = gmon_state;
> > > + membar_producer();
> > 
> > And here?
> 
> gmoninit is not volatile.  I had this idea that I wanted the store to
> be globally visible before proceeding.  But you and Theo are telling
> me I don't need the barrier.  I'll take your word for it, dropped.

... is this ok or is something still amiss here?

Index: subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.15
diff -u -p -r1.15 subr_suspend.c
--- subr_suspend.c  2 Jul 2023 19:02:27 -   1.15
+++ subr_suspend.c  11 Jul 2023 20:27:50 -
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#ifdef GPROF
+#include 
+#endif
 #ifdef HIBERNATE
 #include 
 #endif
@@ -49,6 +52,9 @@ sleep_state(void *v, int sleepmode)
extern int perflevel;
size_t rndbuflen;
char *rndbuf;
+#ifdef GPROF
+   int gmon_state;
+#endif
 #if NSOFTRAID > 0
extern void sr_quiesce(void);
 #endif
@@ -100,6 +106,12 @@ top:
 #ifdef MULTIPROCESSOR
sched_stop_secondary_cpus();
KASSERT(CPU_IS_PRIMARY(curcpu()));
+#endif
+#ifdef GPROF
+   gmon_state = gmoninit;
+   gmoninit = 0;
+#endif
+#ifdef MULTIPROCESSOR
sleep_mp();
 #endif
 
@@ -172,6 +184,11 @@ fail_suspend:
resume_randomness(rndbuf, rndbuflen);
 #ifdef MULTIPROCESSOR
resume_mp();
+#endif
+#ifdef GPROF
+   gmoninit = gmon_state;
+#endif
+#ifdef MULTIPROCESSOR
sched_start_secondary_cpus();
 #endif
vfs_stall(curproc, 0);



Re: GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-10 Thread Scott Cheloha
On Mon, Jul 10, 2023 at 05:19:35PM +0200, Mark Kettenis wrote:
> > Date: Mon, 10 Jul 2023 09:57:39 -0500
> > From: Scott Cheloha 
> > 
> > On Mon, Jul 10, 2023 at 07:42:55AM -0600, Theo de Raadt wrote:
> > > I dare you to write the simplest fix for this, instead of a diff that
> > > scrolls by.
> > 
> > This patch seems to work.  Will need to bang on it for a few more days.
> > 
> > 1. Disable gmoninit after sched_stop_secondary_cpus().  The secondary
> >CPUs have halted, so we aren't racing sysctl(2) on a secondary CPU.
> > 
> > 2. Restore gmoninit between resume_mp() and sched_start_secondary_cpus().
> >The secondary CPUs are out of cpu_hatch(), which is probably where we
> >are crashing during resume.  The secondary CPUs haven't started
> >scheduling yet, so we aren't racing sysctl(2).
> 
> It is still a bit scary to have cpu_hatch() call _mcount() but I guess
> adding __attribute__((no_profile)) to all of the functions called by
> cpu_hatch() isn't really workable either.
> 
> That said...
> 
> > Index: subr_suspend.c
> > ===
> > RCS file: /cvs/src/sys/kern/subr_suspend.c,v
> > retrieving revision 1.15
> > diff -u -p -r1.15 subr_suspend.c
> > --- subr_suspend.c  2 Jul 2023 19:02:27 -   1.15
> > +++ subr_suspend.c  10 Jul 2023 14:51:01 -
> > @@ -18,6 +18,7 @@
> >  
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -100,6 +101,14 @@ top:
> >  #ifdef MULTIPROCESSOR
> > sched_stop_secondary_cpus();
> > KASSERT(CPU_IS_PRIMARY(curcpu()));
> > +#endif
> > +#ifdef GPROF
> > +   extern int gmoninit;
> > +   int gmon_state = gmoninit;
> 
> No variable declarations in the middle of functions please.

Yep, moved up.

> > +   gmoninit = 0;
> > +   membar_producer();
> 
> Why are you messing with memory barriers here?
> 
> > +#endif
> > +#ifdef MULTIPROCESSOR
> > sleep_mp();
> >  #endif
> >  
> > @@ -172,6 +181,12 @@ fail_suspend:
> > resume_randomness(rndbuf, rndbuflen);
> >  #ifdef MULTIPROCESSOR
> > resume_mp();
> > +#endif
> > +#ifdef GPROF
> > +   gmoninit = gmon_state;
> > +   membar_producer();
> 
> And here?

gmoninit is not volatile.  I had this idea that I wanted the store to
be globally visible before proceeding.  But you and Theo are telling
me I don't need the barrier.  I'll take your word for it, dropped.

--

suspend/resume and hibernate/unhibernate still work with the revised
patch below.

Index: subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.15
diff -u -p -r1.15 subr_suspend.c
--- subr_suspend.c  2 Jul 2023 19:02:27 -   1.15
+++ subr_suspend.c  10 Jul 2023 15:30:22 -
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#ifdef GPROF
+#include 
+#endif
 #ifdef HIBERNATE
 #include 
 #endif
@@ -49,6 +52,9 @@ sleep_state(void *v, int sleepmode)
extern int perflevel;
size_t rndbuflen;
char *rndbuf;
+#ifdef GPROF
+   int gmon_state;
+#endif
 #if NSOFTRAID > 0
extern void sr_quiesce(void);
 #endif
@@ -100,6 +106,12 @@ top:
 #ifdef MULTIPROCESSOR
sched_stop_secondary_cpus();
KASSERT(CPU_IS_PRIMARY(curcpu()));
+#endif
+#ifdef GPROF
+   gmon_state = gmoninit;
+   gmoninit = 0;
+#endif
+#ifdef MULTIPROCESSOR
sleep_mp();
 #endif
 
@@ -172,6 +184,11 @@ fail_suspend:
resume_randomness(rndbuf, rndbuflen);
 #ifdef MULTIPROCESSOR
resume_mp();
+#endif
+#ifdef GPROF
+   gmoninit = gmon_state;
+#endif
+#ifdef MULTIPROCESSOR
sched_start_secondary_cpus();
 #endif
vfs_stall(curproc, 0);



Re: GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-10 Thread Scott Cheloha
On Mon, Jul 10, 2023 at 07:42:55AM -0600, Theo de Raadt wrote:
> I dare you to write the simplest fix for this, instead of a diff that
> scrolls by.

This patch seems to work.  Will need to bang on it for a few more days.

1. Disable gmoninit after sched_stop_secondary_cpus().  The secondary
   CPUs have halted, so we aren't racing sysctl(2) on a secondary CPU.

2. Restore gmoninit between resume_mp() and sched_start_secondary_cpus().
   The secondary CPUs are out of cpu_hatch(), which is probably where we
   are crashing during resume.  The secondary CPUs haven't started
   scheduling yet, so we aren't racing sysctl(2).

Index: subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.15
diff -u -p -r1.15 subr_suspend.c
--- subr_suspend.c  2 Jul 2023 19:02:27 -   1.15
+++ subr_suspend.c  10 Jul 2023 14:51:01 -
@@ -18,6 +18,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -100,6 +101,14 @@ top:
 #ifdef MULTIPROCESSOR
sched_stop_secondary_cpus();
KASSERT(CPU_IS_PRIMARY(curcpu()));
+#endif
+#ifdef GPROF
+   extern int gmoninit;
+   int gmon_state = gmoninit;
+   gmoninit = 0;
+   membar_producer();
+#endif
+#ifdef MULTIPROCESSOR
sleep_mp();
 #endif
 
@@ -172,6 +181,12 @@ fail_suspend:
resume_randomness(rndbuf, rndbuflen);
 #ifdef MULTIPROCESSOR
resume_mp();
+#endif
+#ifdef GPROF
+   gmoninit = gmon_state;
+   membar_producer();
+#endif
+#ifdef MULTIPROCESSOR
sched_start_secondary_cpus();
 #endif
vfs_stall(curproc, 0);



Re: GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-10 Thread Scott Cheloha
On Mon, Jul 10, 2023 at 07:09:19AM -0600, Theo de Raadt wrote:
> Mark Kettenis  wrote:
> 
> > So isn't the real problem that some of the lower-level code involved
> > in the resume path isn't properly marked to not do the
> > instrumentation?  Traditionally that was assembly code and we'd use
> > NENTRY() (in amd64) or ENTRY_NP() (on some other architectures) to
> > prevent thise functions from calling _mcount().  But that was only
> > ever done for code used during early bootstrap of the kernel.  And
> > these days there may be C code that needs this as well.
> > 
> > With your diff, functions in the suspend/resume path will still call
> > _mcount() which may not be safe.
> 
> I guess you can make critical functions not do _PROF_PROLOGUE
> or you can make __mcount or _mcount aware that they should "do nothing",
> or "nothing scary".
> 
> Hell, save & toggle the 'gmoninit' variable during the suspend/resume
> sequence, and then adjust one comment:
> 
> /*
>  * Do not profile execution if memory for the current CPU
>  * descriptor and profiling buffers has not yet been allocated
>  * or if the CPU we are running on has not yet set its trap
> -* handler
> +* handler, or disabled during a suspend/resume sequence
>  */
> if (gmoninit == 0)
> return;
> 
> 
> Does this really need another variable?
> 
> It feels like this can be 4 1-line diffs.

Secondary CPUs are still running at the top of sleep_state().  To
disable _mcount with gmoninit we would need to wait until after
secondary CPUs have halted to toggle it off, which is way further into
sleep_state().

Then, on the resume side, you need to keep the secondary CPUs from
toggling gmoninit back on until after all other secondary CPUs have
finished restarting, which I think means changing how we do
sched_start_secondary_cpus().

... given all of that, I thought adding a second variable was easier
and less likely to break something more important than GPROF.



Re: GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-09 Thread Scott Cheloha
On Sun, Jul 09, 2023 at 05:24:43PM -0500, Scott Cheloha wrote:
> On Sun, Jul 09, 2023 at 08:11:43PM +0200, Claudio Jeker wrote:
> > On Sun, Jul 09, 2023 at 12:52:20PM -0500, Scott Cheloha wrote:
> > > This patch fixes resume/unhibernate on GPROF kernels where kgmon(8)
> > > has activated kernel profiling.
> > > 
> > > I think the problem is that code called from cpu_hatch() does not play
> > > nicely with _mcount(), so GPROF kernels crash during resume.  I can't
> > > point you to which code in particular, but keeping all CPUs out of
> > > _mcount() until the primary CPU has completed resume/unhibernate fixes
> > > the crash.
> > > 
> > > ok?
> > 
> > To be honest, I'm not sure we need something like this. GPROF is already a
> > special case and people running a GPROF kernel should probably stop the
> > collection of profile data before suspend/hibernate.
> 
> [..]

Also, deraadt@ insisted that GPROF not hang the system across
suspend/resume and hibernate/unhibernate:

1. https://marc.info/?l=openbsd-tech&m=168721604322193&w=2

>>  Make sure to STOP all kernel profiling before attempting to
>>  suspend or hibernate your machine.  Otherwise I expect it
>>  will hang.
> 
> That is not acceptable.  People suspend and hibernate machines without
> being aware of what applications are doing.
> 
>>  GPROF is a kernel compile-time option.  If you don't enable it,
>>  you have nothing to worry about.
> 
> Well that's a great hidden reason why noone would ever turn on this
> subsystem -- so why is it getting done on it??

2. https://marc.info/?l=openbsd-tech&m=16872161214&w=2

>>  Make sure to STOP all kernel profiling before attempting to
>>  suspend or hibernate your machine.  Otherwise I expect it
>>  will hang.
> 
> It is completely acceptable if it produces wrong results, but it must
> not hang the system.

... this patch fits the bill.

Index: sys/lib/libkern/mcount.c
===
RCS file: /cvs/src/sys/lib/libkern/mcount.c,v
retrieving revision 1.14
diff -u -p -r1.14 mcount.c
--- sys/lib/libkern/mcount.c11 Jan 2022 09:21:34 -  1.14
+++ sys/lib/libkern/mcount.c9 Jul 2023 22:03:22 -
@@ -33,6 +33,28 @@
 #include 
 #include 
 
+#ifdef _KERNEL
+#ifdef SUSPEND
+#include 
+
+volatile int mcount_disabled;
+
+void
+mcount_disable(void)
+{
+   mcount_disabled = 1;
+   membar_producer();
+}
+
+void
+mcount_enable(void)
+{
+   mcount_disabled = 0;
+   membar_producer();
+}
+#endif /* SUSPEND */
+#endif /* _KERNEL */
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -63,7 +85,10 @@ _MCOUNT_DECL(u_long frompc, u_long selfp
 */
if (gmoninit == 0)
return;
-
+#ifdef SUSPEND
+   if (mcount_disabled)
+   return;
+#endif
if ((p = curcpu()->ci_gmon) == NULL)
return;
 #else
Index: sys/kern/subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.15
diff -u -p -r1.15 subr_suspend.c
--- sys/kern/subr_suspend.c 2 Jul 2023 19:02:27 -   1.15
+++ sys/kern/subr_suspend.c 9 Jul 2023 22:03:22 -
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#ifdef GPROF
+#include 
+#endif
 #ifdef HIBERNATE
 #include 
 #endif
@@ -63,6 +66,9 @@ top:
 
if (sleep_showstate(v, sleepmode))
return EOPNOTSUPP;
+#ifdef GPROF
+   mcount_disable();
+#endif
 #if NWSDISPLAY > 0
wsdisplay_suspend();
 #endif
@@ -192,6 +198,9 @@ fail_hiballoc:
start_periodic_resettodr();
 #if NWSDISPLAY > 0
wsdisplay_resume();
+#endif
+#ifdef GPROF
+   mcount_enable();
 #endif
sys_sync(curproc, NULL, NULL);
if (cpu_setperf != NULL)
Index: sys/sys/gmon.h
===
RCS file: /cvs/src/sys/sys/gmon.h,v
retrieving revision 1.9
diff -u -p -r1.9 gmon.h
--- sys/sys/gmon.h  11 Jan 2022 23:59:55 -  1.9
+++ sys/sys/gmon.h  9 Jul 2023 22:03:22 -
@@ -158,6 +158,10 @@ struct gmonparam {
 #ifdef _KERNEL
 extern int gmoninit;   /* Is the kernel ready for being profiled? */
 
+#ifdef SUSPEND
+void mcount_disable(void);
+void mcount_enable(void);
+#endif
 #else /* !_KERNEL */
 
 #include 
Index: lib/libc/gmon/mcount.c
===
RCS file: /cvs/src/lib/libc/gmon/mcount.c,v
retrieving revision 1.16
diff -u -p -r1.16 mcount.c
--- lib/libc/gmon/mcount.c  11 Jan 2022 09:21:34 -  1.16
+++ lib/libc/gmon/mcount.c  9 Jul 2023 22:03:23 -
@@ -31,6 +31,28 @@
 #i

Re: GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-09 Thread Scott Cheloha
On Sun, Jul 09, 2023 at 08:11:43PM +0200, Claudio Jeker wrote:
> On Sun, Jul 09, 2023 at 12:52:20PM -0500, Scott Cheloha wrote:
> > This patch fixes resume/unhibernate on GPROF kernels where kgmon(8)
> > has activated kernel profiling.
> > 
> > I think the problem is that code called from cpu_hatch() does not play
> > nicely with _mcount(), so GPROF kernels crash during resume.  I can't
> > point you to which code in particular, but keeping all CPUs out of
> > _mcount() until the primary CPU has completed resume/unhibernate fixes
> > the crash.
> > 
> > ok?
> 
> To be honest, I'm not sure we need something like this. GPROF is already a
> special case and people running a GPROF kernel should probably stop the
> collection of profile data before suspend/hibernate.

Sorry, I was a little unclear in my original mail.

When I say "has activated kernel profiling" I mean "has *ever*
activated kernel profiling".

Regardless of whether or not profiling is active at the moment we
reach sleep_state(), if kernel profiling has *ever* been activated in
the past, the resume crashes.

When sysctl(2) reaches sysctl_doprof(), gmoninit is set.  "Assume that
if we're here it is safe to execute profiling":

   166  /*
   167   * Return kernel profiling information.
   168   */
   169  int
   170  sysctl_doprof(int *name, u_int namelen, void *oldp, size_t *oldlenp, 
void *newp,
   171  size_t newlen)
   172  {
   173  CPU_INFO_ITERATOR cii;
   174  struct cpu_info *ci;
   175  struct gmonparam *gp = NULL;
   176  int error, cpuid, op, state;
   177  
   178  /* all sysctl names at this level are name and field */
   179  if (namelen != 2)
   180  return (ENOTDIR);   /* overloaded */
   181  
   182  op = name[0];
   183  cpuid = name[1];
   184  
   185  CPU_INFO_FOREACH(cii, ci) {
   186  if (cpuid == CPU_INFO_UNIT(ci)) {
   187  gp = ci->ci_gmon;
   188  break;
   189  }
   190  }
   191  
   192  if (gp == NULL)
   193  return (EOPNOTSUPP);
   194  
   195  /* Assume that if we're here it is safe to execute profiling. */
   196  gmoninit = 1;

After that first sysctl(2), all CPUs will stop bouncing out of
_mcount() at the gmoninit check and start checking per-CPU data
structures to decide whether or not to record the arc.  "Do not
profile execution...".

73  _MCOUNT_DECL(u_long frompc, u_long selfpc) __used;
74  /* _mcount; may be static, inline, etc */
75  _MCOUNT_DECL(u_long frompc, u_long selfpc)
76  {
77  u_short *frompcindex;
78  struct tostruct *top, *prevtop;
79  struct gmonparam *p;
80  long toindex;
81  #ifdef _KERNEL
82  int s;
83  
84  /*
85   * Do not profile execution if memory for the current CPU
86   * descriptor and profiling buffers has not yet been allocated
87   * or if the CPU we are running on has not yet set its trap
88   * handler.
89   */
90  if (gmoninit == 0)
91  return;
92  #ifdef SUSPEND
93  if (mcount_disabled)
94  return;
95  #endif
96  if ((p = curcpu()->ci_gmon) == NULL)
97  return;
98  #else
99  p = &_gmonparam;
   100  #endif
   101  /*
   102   * check that we are profiling
   103   * and that we aren't recursively invoked.
   104   */
   105  if (p->state != GMON_PROF_ON)
   106  return;
   107  #ifdef _KERNEL
   108  MCOUNT_ENTER;

This patch adds a second check to _mcount(), mcount_disabled, which
has a distinct meaning.

gmoninit means:

The boot initialization is now done and it is safe to touch the
per-CPU GPROF data structures.

mcount_disabled says:

Keep out.

There may be a clever way to merge the two variables, but the simplest
thing I could think of was to just add a second boolean.  The current
patch is idiot-proof.
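
A toy model of the two flags, if it helps (userland sketch; only the names
match the patch, nothing else does):

#include <stdio.h>

int gmoninit;		/* boot init done, per-CPU gmon state may exist */
int mcount_disabled;	/* suspend/resume under way: keep out */

int
would_profile(void)
{
	if (gmoninit == 0)
		return 0;	/* never initialized, nothing to touch */
	if (mcount_disabled)
		return 0;	/* initialized, but temporarily fenced off */
	return 1;		/* ...the per-CPU checks come after this */
}

int
main(void)
{
	gmoninit = 1;		/* kgmon(8) has run at least once */
	mcount_disabled = 1;	/* we are inside sleep_state() */
	printf("profile? %d\n", would_profile());
	return 0;
}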

> Unless someone wants to gprof suspend/hibernate but then doing this will
> result in bad data.

That's way out of scope, I'm not advocating for that.  If you want to
profile sleep_state() you need to do it some other way.

> Another option is to abort zzz/ZZZ if kernel profiling is running.

I don't think this would be a good user experience.  Performing the
suspend without question and possibly returning bad profiling results
is better than failing the suspend.

IMHO, if suspend/resume interferes with kgmon(8) yielding accurate
results we just need to document it in kgmon.8.

I would argue that suspend/resume is an edge case o

GPROF: sleep_state: disable _mcount() across suspend/resume

2023-07-09 Thread Scott Cheloha
This patch fixes resume/unhibernate on GPROF kernels where kgmon(8)
has activated kernel profiling.

I think the problem is that code called from cpu_hatch() does not play
nicely with _mcount(), so GPROF kernels crash during resume.  I can't
point you to which code in particular, but keeping all CPUs out of
_mcount() until the primary CPU has completed resume/unhibernate fixes
the crash.

ok?

Index: sys/lib/libkern/mcount.c
===
RCS file: /cvs/src/sys/lib/libkern/mcount.c,v
retrieving revision 1.14
diff -u -p -r1.14 mcount.c
--- sys/lib/libkern/mcount.c11 Jan 2022 09:21:34 -  1.14
+++ sys/lib/libkern/mcount.c9 Jul 2023 17:49:55 -
@@ -33,6 +33,32 @@
 #include 
 #include 
 
+#ifdef _KERNEL
+#ifdef SUSPEND
+#include 
+
+#include/* KASSERT */
+
+volatile int mcount_disabled;
+
+void
+mcount_disable(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   mcount_disabled = 1;
+   membar_producer();
+}
+
+void
+mcount_enable(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   mcount_disabled = 0;
+   membar_producer();
+}
+#endif /* SUSPEND */
+#endif /* _KERNEL */
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -63,7 +89,10 @@ _MCOUNT_DECL(u_long frompc, u_long selfp
 */
if (gmoninit == 0)
return;
-
+#ifdef SUSPEND
+   if (mcount_disabled)
+   return;
+#endif
if ((p = curcpu()->ci_gmon) == NULL)
return;
 #else
Index: sys/kern/subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.15
diff -u -p -r1.15 subr_suspend.c
--- sys/kern/subr_suspend.c 2 Jul 2023 19:02:27 -   1.15
+++ sys/kern/subr_suspend.c 9 Jul 2023 17:49:55 -
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#ifdef GPROF
+#include 
+#endif
 #ifdef HIBERNATE
 #include 
 #endif
@@ -63,6 +66,9 @@ top:
 
if (sleep_showstate(v, sleepmode))
return EOPNOTSUPP;
+#ifdef GPROF
+   mcount_disable();
+#endif
 #if NWSDISPLAY > 0
wsdisplay_suspend();
 #endif
@@ -192,6 +198,9 @@ fail_hiballoc:
start_periodic_resettodr();
 #if NWSDISPLAY > 0
wsdisplay_resume();
+#endif
+#ifdef GPROF
+   mcount_enable();
 #endif
sys_sync(curproc, NULL, NULL);
if (cpu_setperf != NULL)
Index: sys/sys/gmon.h
===
RCS file: /cvs/src/sys/sys/gmon.h,v
retrieving revision 1.9
diff -u -p -r1.9 gmon.h
--- sys/sys/gmon.h  11 Jan 2022 23:59:55 -  1.9
+++ sys/sys/gmon.h  9 Jul 2023 17:49:55 -
@@ -158,6 +158,10 @@ struct gmonparam {
 #ifdef _KERNEL
 extern int gmoninit;   /* Is the kernel ready for being profiled? */
 
+#ifdef SUSPEND
+void mcount_disable(void);
+void mcount_enable(void);
+#endif
 #else /* !_KERNEL */
 
 #include 
Index: lib/libc/gmon/mcount.c
===
RCS file: /cvs/src/lib/libc/gmon/mcount.c,v
retrieving revision 1.16
diff -u -p -r1.16 mcount.c
--- lib/libc/gmon/mcount.c  11 Jan 2022 09:21:34 -  1.16
+++ lib/libc/gmon/mcount.c  9 Jul 2023 17:49:55 -
@@ -31,6 +31,32 @@
 #include 
 #include 
 
+#ifdef _KERNEL
+#ifdef SUSPEND
+#include 
+
+#include/* KASSERT */
+
+volatile int mcount_disabled;
+
+void
+mcount_disable(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   mcount_disabled = 1;
+   membar_producer();
+}
+
+void
+mcount_enable(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   mcount_disabled = 0;
+   membar_producer();
+}
+#endif /* SUSPEND */
+#endif /* _KERNEL */
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -61,7 +87,10 @@ _MCOUNT_DECL(u_long frompc, u_long selfp
 */
if (gmoninit == 0)
return;
-
+#ifdef SUSPEND
+   if (mcount_disabled)
+   return;
+#endif
if ((p = curcpu()->ci_gmon) == NULL)
return;
 #else



glxclk(4/loongson): remove driver

2023-07-05 Thread Scott Cheloha
glxclk(4) has been compiled-but-disabled for over six months.  It was
disabled when loongson made the clockintr switch.  Nobody has asked me
to make it an intrclock option for loongson so I assume the mips64 CP0
interrupt clock is sufficient.

This patch deletes the driver, driver config glue, manpage, and
manpage cross-references.  Not sure if I got it all.  I have no
system to test this with.

One thing I noticed: glxclk(4) is compiled into loongson GENERIC but
not loongson RAMDISK.  A bit odd for a clock interrupt driver, no?  I
figure you would want to be sure certain such a basic component was
working during installation, but maybe I'm missing something.

Anyway, did I get everything?  If so, ok?

Index: sys/arch/loongson/dev/glxclk.c
===
RCS file: sys/arch/loongson/dev/glxclk.c
diff -N sys/arch/loongson/dev/glxclk.c
--- sys/arch/loongson/dev/glxclk.c  19 Nov 2022 16:23:48 -  1.8
+++ /dev/null   1 Jan 1970 00:00:00 -
@@ -1,338 +0,0 @@
-/* $OpenBSD: glxclk.c,v 1.8 2022/11/19 16:23:48 cheloha Exp $  */
-
-/*
- * Copyright (c) 2013 Paul Irofti.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-#include 
-
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-struct glxclk_softc {
-   struct device   sc_dev;
-
-   bus_space_tag_t sc_iot;
-   bus_space_handle_t  sc_ioh;
-};
-
-struct cfdriver glxclk_cd = {
-   NULL, "glxclk", DV_DULL
-};
-
-intglxclk_match(struct device *, void *, void *);
-void   glxclk_attach(struct device *, struct device *, void *);
-intglxclk_intr(void *);
-intglxclk_stat_intr(void *arg);
-void   glxclk_startclock(struct cpu_info *);
-
-const struct cfattach glxclk_ca = {
-   sizeof(struct glxclk_softc), glxclk_match, glxclk_attach,
-};
-
-#defineMSR_LBAR_ENABLE 0x1ULL
-#defineMSR_LBAR_MFGPT  DIVIL_LBAR_MFGPT
-#defineMSR_MFGPT_SIZE  0x40
-#defineMSR_MFGPT_ADDR_MASK 0xffc0
-
-#defineAMD5536_MFGPT1_CMP2 0x000a  /* Compare value for 
CMP2 */
-#defineAMD5536_MFGPT1_CNT  0x000c  /* Up counter */
-#defineAMD5536_MFGPT1_SETUP0x000e  /* Setup register */
-#defineAMD5536_MFGPT1_SCALE0x7 /* Set to 128 */
-#defineAMD5536_MFGPT1_C2_IRQM  0x0200
-
-#defineAMD5536_MFGPT2_CMP2 0x0012  /* Compare value for 
CMP2 */
-#defineAMD5536_MFGPT2_CNT  0x0014  /* Up counter */
-#defineAMD5536_MFGPT2_SETUP0x0016  /* Setup register */
-#defineAMD5536_MFGPT2_SCALE0x3 /* Divide by 8 */
-#defineAMD5536_MFGPT2_C2_IRQM  0x0400
-
-#defineAMD5536_MFGPT_CNT_EN(1 << 15)   /* Enable counting */
-#defineAMD5536_MFGPT_CMP2  (1 << 14)   /* Compare 2 output */
-#defineAMD5536_MFGPT_CMP1  (1 << 13)   /* Compare 1 output */
-#define AMD5536_MFGPT_SETUP(1 << 12)   /* Set to 1 after 1st write */
-#defineAMD5536_MFGPT_STOP_EN   (1 << 11)   /* Stop enable */
-#defineAMD5536_MFGPT_CMP2MODE  (1 << 9)|(1 << 8)/* Set to GE + 
activate IRQ */
-#define AMD5536_MFGPT_CLKSEL   (1 << 4)/* Clock select 14MHz */
-
-
-struct glxclk_softc *glxclk_sc;
-
-/*
- * Statistics clock interval and variance, in usec.  Variance must be a
- * power of two.  Since this gives us an even number, not an odd number,
- * we discard one case and compensate.  That is, a variance of 1024 would
- * give us offsets in [0..1023].  Instead, we take offsets in [1..1023].
- * This is symmetric about the point 512, or statvar/2, and thus averages
- * to that value (assuming uniform random numbers).
- */
-/* XXX fix comment to match value */
-int statvar = 8192;
-int statmin;   /* statclock interval - 1/2*variance */
-
-int
-glxclk_match(struct device *parent, void *match, void *aux)
-{
-   struct glxpcib_attach_args *gaa = aux;
-   struct cfdata *cf = match;
-
-   if (strcmp(gaa->gaa_name, cf->cf_driver->cd_name) != 0)
-   return 0;
-
-   return 1;
-}
-
-void
-glxclk_attach(struct device 

all platforms, kernel: remove __HAVE_CLOCKINTR symbol

2023-07-01 Thread Scott Cheloha
Every platform made the clockintr switch six months ago or more.  The
__HAVE_CLOCKINTR symbol is now redundant and can be removed.

ok?

Index: ./ddb/db_command.c
===
RCS file: /cvs/src/sys/ddb/db_command.c,v
retrieving revision 1.98
diff -u -p -r1.98 db_command.c
--- ./ddb/db_command.c  8 Mar 2023 04:43:07 -   1.98
+++ ./ddb/db_command.c  2 Jul 2023 01:34:00 -
@@ -579,9 +579,7 @@ db_bcstats_print_cmd(db_expr_t addr, int
 const struct db_command db_show_all_cmds[] = {
{ "procs",  db_show_all_procs,  0, NULL },
{ "callout",db_show_callout,0, NULL },
-#ifdef __HAVE_CLOCKINTR
{ "clockintr",  db_show_all_clockintr,  0, NULL },
-#endif
{ "pools",  db_show_all_pools,  0, NULL },
{ "mounts", db_show_all_mounts, 0, NULL },
{ "vnodes", db_show_all_vnodes, 0, NULL },
Index: ./ddb/db_interface.h
===
RCS file: /cvs/src/sys/ddb/db_interface.h,v
retrieving revision 1.25
diff -u -p -r1.25 db_interface.h
--- ./ddb/db_interface.h5 Nov 2022 19:29:45 -   1.25
+++ ./ddb/db_interface.h2 Jul 2023 01:34:00 -
@@ -44,9 +44,7 @@ void db_kill_cmd(db_expr_t, int, db_expr
 void db_show_all_procs(db_expr_t, int, db_expr_t, char *);
 
 /* kern/kern_clockintr.c */
-#ifdef __HAVE_CLOCKINTR
 void db_show_all_clockintr(db_expr_t, int, db_expr_t, char *);
-#endif
 
 /* kern/kern_timeout.c */
 void db_show_callout(db_expr_t, int, db_expr_t, char *);
Index: ./kern/kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.26
diff -u -p -r1.26 kern_clockintr.c
--- ./kern/kern_clockintr.c 2 Jul 2023 00:55:18 -   1.26
+++ ./kern/kern_clockintr.c 2 Jul 2023 01:34:00 -
@@ -29,8 +29,6 @@
 #include 
 #include 
 
-#ifdef __HAVE_CLOCKINTR
-
 /*
  * Protection for global variables in this file:
  *
@@ -773,4 +771,3 @@ db_show_clockintr(const struct clockintr
 }
 
 #endif /* DDB */
-#endif /*__HAVE_CLOCKINTR */
Index: ./kern/kern_sysctl.c
===
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.415
diff -u -p -r1.415 kern_sysctl.c
--- ./kern/kern_sysctl.c21 May 2023 12:47:54 -  1.415
+++ ./kern/kern_sysctl.c2 Jul 2023 01:34:00 -
@@ -430,11 +430,9 @@ kern_sysctl_dirs(int top_name, int *name
case KERN_CPUSTATS:
return (sysctl_cpustats(name, namelen, oldp, oldlenp,
newp, newlen));
-#ifdef __HAVE_CLOCKINTR
case KERN_CLOCKINTR:
return sysctl_clockintr(name, namelen, oldp, oldlenp, newp,
newlen);
-#endif
default:
return (ENOTDIR);   /* overloaded */
}
Index: ./kern/subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.14
diff -u -p -r1.14 subr_suspend.c
--- ./kern/subr_suspend.c   10 Nov 2022 10:37:40 -  1.14
+++ ./kern/subr_suspend.c   2 Jul 2023 01:34:00 -
@@ -165,10 +165,9 @@ fail_suspend:
splx(s);
 
inittodr(gettime());
-#ifdef __HAVE_CLOCKINTR
clockintr_cpu_init(NULL);
clockintr_trigger();
-#endif
+
sleep_resume(v);
resume_randomness(rndbuf, rndbuflen);
 #ifdef MULTIPROCESSOR
Index: ./arch/alpha/include/_types.h
===
RCS file: /cvs/src/sys/arch/alpha/include/_types.h,v
retrieving revision 1.25
diff -u -p -r1.25 _types.h
--- ./arch/alpha/include/_types.h   10 Dec 2022 15:02:29 -  1.25
+++ ./arch/alpha/include/_types.h   2 Jul 2023 01:34:00 -
@@ -35,8 +35,6 @@
 #ifndef _MACHINE__TYPES_H_
 #define _MACHINE__TYPES_H_
 
-#define__HAVE_CLOCKINTR
-
 #if defined(_KERNEL)
 typedef struct label_t {
long val[10];
Index: ./arch/amd64/include/_types.h
===
RCS file: /cvs/src/sys/arch/amd64/include/_types.h,v
retrieving revision 1.18
diff -u -p -r1.18 _types.h
--- ./arch/amd64/include/_types.h   8 Nov 2022 17:34:13 -   1.18
+++ ./arch/amd64/include/_types.h   2 Jul 2023 01:34:00 -
@@ -35,8 +35,6 @@
 #ifndef _MACHINE__TYPES_H_
 #define _MACHINE__TYPES_H_
 
-#define__HAVE_CLOCKINTR
-
 /*
  * _ALIGN(p) rounds p (pointer or byte index) up to a correctly-aligned
  * value for all data types (int, long, ...).   The result is an
Index: ./arch/arm/include/_types.h
===
RCS file: /cvs/src/sys/arch/arm/include/_types.h,v
retrieving revision 1.20
diff -u -p -r1.20 _types.h
--- ./arch/arm/include/_types.h 17 Jan 2023 02:27:14 -  1.20
+++ ./arch/arm/include/_types.h 2 Jul 

Re: profclock, gmonclock: new callbacks for profil(2)/GPROF statclock() code

2023-06-28 Thread Scott Cheloha
On Fri, Jun 23, 2023 at 04:31:59PM -0500, Scott Cheloha wrote:
> On Tue, Jun 20, 2023 at 08:35:11AM -0600, Theo de Raadt wrote:
> > Claudio Jeker  wrote:
> > 
> > > On Mon, Jun 19, 2023 at 06:41:14PM -0500, Scott Cheloha wrote:
> > > > > On Jun 19, 2023, at 18:07, Theo de Raadt  wrote:
> > > > > 
> > > > > Make sure to STOP all kernel profiling before attempting to
> > > > >suspend or hibernate your machine.  Otherwise I expect it
> > > > >will hang.
> > > > > 
> > > > > It is completely acceptable if it produces wrong results, but it must
> > > > > not hang the system.
> > > > 
> > > > The hang is present in -current, with or
> > > > without this patch.
> > > > 
> > > > I am working to figure it out.
> > > 
> > > I don't think the suspend or hibernate code has any code to disable
> > > kernel profiling. This is bad since these code paths are extremly
> > > sensitive and try not to do side-effects.
> > > 
> > > So in the suspend/hibernate code path we should disable profiling early
> > > on. It makes no sense to try to run gprof collection in those code paths.
> > 
> > Yes, that's right.
> > 
> > It will be somewhere in kern/subr_suspend.c
> > 
> > Be careful that the "stop profiling" and "restart profiling" are at the
> > correct places.  The sleep_state() function has a bunch of unrolling
> > goto's which are not 100% reflexive, so be careful.
> 
> Judging from the blinking light on my laptop, the crash is in the
> resume path.
> 
> This patch appears to fix the problem on amd64:
> 
> - Add a new guard variable, gmon_suspended,
> - Toggle gmon_suspended off at the top of sleep_state(), and
> - Toggle gmon_suspended back on at the bottom of sleep_state().
> 
> With this applied, I can suspend/resume and hibernate/unhibernate an
> amd64/GENERIC.MP kernel w/ GPROF without issue, even if kgmon(8) has
> enabled kernel profiling and increased the effective statclock
> frequency.

Here is a cleaner version.

- Add mcount_disable(), mcount_enable() to mcount.c.  There's no
  reason to put the interfaces in subr_prof.c, we're only
  concerned with _mcount() here.

- Only call mcount_disable/mcount_enable on GPROF kernels.  DDBPROF
  does something different, it isn't relevant here.

- Sync sys/lib/libkern/mcount.c with lib/libc/gmon/mcount.c.

As with the prior patch, this seems to fix suspend/resume and
hibernate/unhibernate on amd64 GPROF kernels when kernel profiling is
activated with kgmon(8).

OK?

Index: sys/lib/libkern/mcount.c
===
RCS file: /cvs/src/sys/lib/libkern/mcount.c,v
retrieving revision 1.14
diff -u -p -r1.14 mcount.c
--- sys/lib/libkern/mcount.c11 Jan 2022 09:21:34 -  1.14
+++ sys/lib/libkern/mcount.c28 Jun 2023 23:01:13 -
@@ -33,6 +33,32 @@
 #include 
 #include 
 
+#ifdef _KERNEL
+#ifdef SUSPEND
+#include 
+
+#include/* KASSERT */
+
+volatile int mcount_disabled;
+
+void
+mcount_disable(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   mcount_disabled = 1;
+   membar_producer();
+}
+
+void
+mcount_enable(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   mcount_disabled = 0;
+   membar_producer();
+}
+#endif /* SUSPEND */
+#endif /* _KERNEL */
+
 /*
  * mcount is called on entry to each function compiled with the profiling
  * switch set.  _mcount(), which is declared in a machine-dependent way
@@ -63,7 +89,10 @@ _MCOUNT_DECL(u_long frompc, u_long selfp
 */
if (gmoninit == 0)
return;
-
+#ifdef SUSPEND
+   if (mcount_disabled)
+   return;
+#endif
if ((p = curcpu()->ci_gmon) == NULL)
return;
 #else
Index: sys/kern/subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.14
diff -u -p -r1.14 subr_suspend.c
--- sys/kern/subr_suspend.c 10 Nov 2022 10:37:40 -  1.14
+++ sys/kern/subr_suspend.c 28 Jun 2023 23:01:13 -
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#ifdef GPROF
+#include 
+#endif
 #ifdef HIBERNATE
 #include 
 #endif
@@ -63,6 +66,11 @@ top:
 
if (sleep_showstate(v, sleepmode))
return EOPNOTSUPP;
+
+#ifdef GPROF
+   /* Keep everyone out of _mcount() until we have fully resumed. */
+   mcount_disable();
+#endif
 #if NWSDISPLAY > 0
wsdisplay_suspend();
 #endif
@@ -193,6 +201,9 @@ fail_hiballoc:
start_periodic_resettodr();
 #if NWSDISPLAY > 0
wsdisplay_resume();
+#endif
+#ifdef GPROF
+   mcount_enable();
 #endif

Re: csh(1), ksh(1), time(1): print durations with millisecond precision

2023-06-25 Thread Scott Cheloha
On Tue, Jun 13, 2023 at 10:59:53PM -0500, Scott Cheloha wrote:
> This patch bumps the precision of durations printed by csh(1), ksh(1),
> and time(1) from centiseconds to milliseconds.  The csh(1) and ksh(1)
> builtins "time" and "times" are affected.
> 
> My thinking is:
> 
> - All practical OpenBSD platforms have a timecounter with at least
>   millisecond precision.
> 
> - It's not uncommon for people to run a custom HZ=1000 kernel.
>   At HZ=1000, the profiled user and system durations offered by
>   getrusage(2) are (arguably) millisecond precision.
> 
>   Yes, I know those numbers are profiled and thus not especially
>   trustworthy, but it's no different from the situation on a
>   HZ=100 kernel.
> 
> - The timing commands offered in other shells like bash and dash
>   provide (at least) millisecond precision.
> 
> - Centiseconds are a bit odd.  They don't align with the tidy
>   "thousands" separation typical of metric units.
> 
> - The POSIX standard for time(1) and the "times" builtin specifies
>   that the durations are formatted as a floating point value of
>   seconds, i.e. "%f".  This means millisecond precision is okay:
> 
> https://pubs.opengroup.org/onlinepubs/9699919799/utilities/time.html
> https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#times

One week bump.
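
As a quick illustration of the precision bump (a toy example, not the shell
code): the same interval printed the old centisecond way and the new
millisecond way.

#include <stdio.h>
#include <sys/time.h>

int
main(void)
{
	struct timeval tv = { 1, 234567 };	/* 1.234567 seconds */

	printf("old: %lld.%02ld\n",
	    (long long)tv.tv_sec, (long)(tv.tv_usec / 10000));
	printf("new: %lld.%03ld\n",
	    (long long)tv.tv_sec, (long)(tv.tv_usec / 1000));
	return 0;
}

This prints "old: 1.23" and "new: 1.234".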

Index: bin/csh/time.c
===
RCS file: /cvs/src/bin/csh/time.c,v
retrieving revision 1.18
diff -u -p -r1.18 time.c
--- bin/csh/time.c  8 Mar 2023 04:43:04 -   1.18
+++ bin/csh/time.c  14 Jun 2023 15:30:01 -
@@ -40,6 +40,7 @@
  * C Shell - routines handling process timing and niceing
  */
 static voidpdeltat(struct timeval *, struct timeval *);
+static voidpdelta_hms(const struct timespec *, const struct timespec *);
 
 void
 settimes(void)
@@ -145,7 +146,7 @@ prusage(struct rusage *r0, struct rusage
break;
 
case 'E':   /* elapsed (wall-clock) time */
-   pcsecs((long) ms);
+   pdelta_hms(e, b);
break;
 
case 'P':   /* percent time spent running */
@@ -227,8 +228,7 @@ pdeltat(struct timeval *t1, struct timev
 struct timeval td;
 
 timersub(t1, t0, );
-(void) fprintf(cshout, "%lld.%01ld", (long long)td.tv_sec,
-   td.tv_usec / 10);
+fprintf(cshout, "%lld.%03ld", (long long)td.tv_sec, td.tv_usec / 1000);
 }
 
 #define  P2DIG(i) (void) fprintf(cshout, "%d%d", (i) / 10, (i) % 10)
@@ -254,23 +254,18 @@ minsec:
 }
 
 void
-pcsecs(long l) /* PWP: print mm:ss.dd, l is in sec*100 */
+pdelta_hms(const struct timespec *t1, const struct timespec *t0)
 {
-int i;
+struct timespec elapsed;
+long long hours, minutes, seconds;
 
-i = l / 36;
-if (i) {
-   (void) fprintf(cshout, "%d:", i);
-   i = (l % 36) / 100;
-   P2DIG(i / 60);
-   goto minsec;
-}
-i = l / 100;
-(void) fprintf(cshout, "%d", i / 60);
-minsec:
-i %= 60;
-(void) fputc(':', cshout);
-P2DIG(i);
-(void) fputc('.', cshout);
-P2DIG((int) (l % 100));
+timespecsub(t1, t0, );
+hours = elapsed.tv_sec / 3600;
+minutes = (elapsed.tv_sec % 3600) / 60;
+seconds = elapsed.tv_sec % 60;
+if (hours != 0)
+   fprintf(cshout, "%lld:%02lld:", hours, minutes);
+else
+   fprintf(cshout, "%lld:", minutes);
+fprintf(cshout, "%02lld.%03ld", seconds, elapsed.tv_nsec / 1000000);
 }
Index: bin/ksh/c_sh.c
===
RCS file: /cvs/src/bin/ksh/c_sh.c,v
retrieving revision 1.64
diff -u -p -r1.64 c_sh.c
--- bin/ksh/c_sh.c  22 May 2020 07:50:07 -  1.64
+++ bin/ksh/c_sh.c  14 Jun 2023 15:30:01 -
@@ -681,13 +681,13 @@ p_tv(struct shf *shf, int posix, struct 
 char *suffix)
 {
if (posix)
-   shf_fprintf(shf, "%s%*lld.%02ld%s", prefix ? prefix : "",
-   width, (long long)tv->tv_sec, tv->tv_usec / 1, suffix);
+   shf_fprintf(shf, "%s%*lld.%03ld%s", prefix ? prefix : "",
+   width, (long long)tv->tv_sec, tv->tv_usec / 1000, suffix);
else
-   shf_fprintf(shf, "%s%*lldm%02lld.%02lds%s", prefix ? prefix : 
"",
+   shf_fprintf(shf, "%s%*lldm%02lld.%03lds%s", prefix ? prefix : 
"",
width, (long long)tv->tv_sec / 60,
(long long)tv->tv_sec % 60,
-   tv->tv_usec / 1, suffix);
+   tv->tv_usec / 1000, suffix);
 }
 
 static void
@@ -695,14 +695,14 @@ p_ts(struct shf *shf, int posix, struct 
 char *su

Re: profclock, gmonclock: new callbacks for profil(2)/GPROF statclock() code

2023-06-23 Thread Scott Cheloha
On Tue, Jun 20, 2023 at 08:35:11AM -0600, Theo de Raadt wrote:
> Claudio Jeker  wrote:
> 
> > On Mon, Jun 19, 2023 at 06:41:14PM -0500, Scott Cheloha wrote:
> > > > On Jun 19, 2023, at 18:07, Theo de Raadt  wrote:
> > > > 
> > > > > Make sure to STOP all kernel profiling before attempting to
> > > >suspend or hibernate your machine.  Otherwise I expect it
> > > >will hang.
> > > > 
> > > > It is completely acceptable if it produces wrong results, but it must
> > > > not hang the system.
> > > 
> > > The hang is present in -current, with or
> > > without this patch.
> > > 
> > > I am working to figure it out.
> > 
> > I don't think the suspend or hibernate code has any code to disable
> > kernel profiling. This is bad since these code paths are extremely
> > sensitive and try not to do side-effects.
> > 
> > So in the suspend/hibernate code path we should disable profiling early
> > on. It makes no sense to try to run gprof collection in those code paths.
> 
> Yes, that's right.
> 
> It will be somewhere in kern/subr_suspend.c
> 
> Be careful that the "stop profiling" and "restart profiling" are at the
> correct places.  The sleep_state() function has a bunch of unrolling
> goto's which are not 100% reflexive, so be careful.

Judging from the blinking light on my laptop, the crash is in the
resume path.

This patch appears to fix the problem on amd64:

- Add a new guard variable, gmon_suspended,
- Toggle gmon_suspended off at the top of sleep_state(), and
- Toggle gmon_suspended back on at the bottom of sleep_state().

With this applied, I can suspend/resume and hibernate/unhibernate an
amd64/GENERIC.MP kernel w/ GPROF without issue, even if kgmon(8) has
enabled kernel profiling and increased the effective statclock
frequency.

Index: sys/kern/subr_prof.c
===
RCS file: /cvs/src/sys/kern/subr_prof.c,v
retrieving revision 1.35
diff -u -p -r1.35 subr_prof.c
--- sys/kern/subr_prof.c2 Jun 2023 17:44:29 -   1.35
+++ sys/kern/subr_prof.c23 Jun 2023 21:25:51 -
@@ -34,6 +34,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -59,6 +60,31 @@ int gmoninit = 0;
 u_int gmon_cpu_count;  /* [K] number of CPUs with profiling enabled */
 
 extern char etext[];
+
+/*
+ * The suspend and hibernate paths need to be free
+ * of side effects.  Keep all CPUs out of _mcount()
+ * while a suspend/resume is ongoing.
+ */
+#ifdef SUSPEND
+volatile int gmon_suspended;
+
+void
+prof_resume(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   gmon_suspended = 0;
+   membar_producer();
+}
+
+void
+prof_suspend(void)
+{
+   KASSERT(CPU_IS_PRIMARY(curcpu()));
+   gmon_suspended = 1;
+   membar_producer();
+}
+#endif /* SUSPEND */
 
 void
 prof_init(void)
Index: sys/kern/subr_suspend.c
===
RCS file: /cvs/src/sys/kern/subr_suspend.c,v
retrieving revision 1.14
diff -u -p -r1.14 subr_suspend.c
--- sys/kern/subr_suspend.c 10 Nov 2022 10:37:40 -  1.14
+++ sys/kern/subr_suspend.c 23 Jun 2023 21:25:51 -
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -63,6 +64,11 @@ top:
 
if (sleep_showstate(v, sleepmode))
return EOPNOTSUPP;
+
+#if defined(GPROF) || defined(DDBPROF)
+   prof_suspend();
+#endif
+
 #if NWSDISPLAY > 0
wsdisplay_suspend();
 #endif
@@ -193,6 +199,9 @@ fail_hiballoc:
start_periodic_resettodr();
 #if NWSDISPLAY > 0
wsdisplay_resume();
+#endif
+#if defined(GPROF) || defined(DDBPROF)
+   prof_resume();
 #endif
sys_sync(curproc, NULL, NULL);
if (cpu_setperf != NULL)
Index: sys/sys/gmon.h
===
RCS file: /cvs/src/sys/sys/gmon.h,v
retrieving revision 1.9
diff -u -p -r1.9 gmon.h
--- sys/sys/gmon.h  11 Jan 2022 23:59:55 -  1.9
+++ sys/sys/gmon.h  23 Jun 2023 21:25:51 -
@@ -158,6 +158,13 @@ struct gmonparam {
 #ifdef _KERNEL
 extern int gmoninit;   /* Is the kernel ready for being profiled? */
 
+#ifdef SUSPEND
+extern volatile int gmon_suspended;/* Ongoing suspend/resume? */
+
+void prof_resume(void);
+void prof_suspend(void);
+#endif
+
 #else /* !_KERNEL */
 
 #include 
Index: sys/lib/libkern/mcount.c
===
RCS file: /cvs/src/sys/lib/libkern/mcount.c,v
retrieving revision 1.14
diff -u -p -r1.14 mcount.c
--- sys/lib/libkern/mcount.c11 Jan 2022 09:21:34 -  1.14
+++ sys/lib/libkern/mcount.c23 Jun 2023 21:25:51 -
@@ -64,6 +64,10 @@ _MCOUNT

Re: uvm_meter: improve periodic execution logic for uvm_loadav()

2023-06-20 Thread Scott Cheloha
On Tue, Jun 20, 2023 at 11:47:10AM +0200, Claudio Jeker wrote:
> On Mon, Jun 19, 2023 at 04:45:03PM -0500, Scott Cheloha wrote:
> 
> [...]
> 
> > Index: uvm/uvm_meter.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
> > retrieving revision 1.42
> > diff -u -p -r1.42 uvm_meter.c
> > --- uvm/uvm_meter.c 28 Dec 2020 14:01:23 -  1.42
> > +++ uvm/uvm_meter.c 19 Jun 2023 21:35:22 -
> > @@ -42,6 +42,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -65,6 +66,9 @@
> >  int maxslp = MAXSLP;   /* patchable ... */
> >  struct loadavg averunnable;
> >  
> > +/* Update load averages every five seconds. */
> > +#define UVM_METER_INTVL5
> > +
> >  /*
> >   * constants for averages over 1, 5, and 15 minutes when sampling at
> >   * 5 second intervals.
> > @@ -78,17 +82,29 @@ static fixpt_t cexp[3] = {
> >  
> >  
> >  static void uvm_loadav(struct loadavg *);
> > +void uvm_meter(void *);
> >  void uvm_total(struct vmtotal *);
> >  void uvmexp_read(struct uvmexp *);
> >  
> > +void
> > +uvm_meter_start(void)
> > +{
> > +   static struct timeout to = TIMEOUT_INITIALIZER(uvm_meter, );
> > +
> > +   uvm_meter();
> > +}
> > +
> >  /*
> >   * uvm_meter: calculate load average and wake up the swapper (if needed)
> >   */
> >  void
> > -uvm_meter(void)
> > +uvm_meter(void *arg)
> >  {
> > -   if ((gettime() % 5) == 0)
> > -   uvm_loadav();
> > +   struct timeout *to = arg;
> > +
> > +   timeout_add_sec(to, UVM_METER_INTVL);
> > +
> > +   uvm_loadav();
> > if (proc0.p_slptime > (maxslp / 2))
> > wakeup();
> >  }
> 
> Why add uvm_meter_start() using a static global value and then pass that
> value around. This code could just be:
> 
> struct timeout uvm_meter_to = TIMEOUT_INITIALIZER(uvm_meter, NULL);
> 
> void
> uvm_meter(void *arg)
> {
>   timeout_add_sec(_meter_to, UVM_METER_INTVL);
>   uvm_loadav();
> }
> 
> and then just call uvm_meter() once in scheduler_start().
> I don't understand why all this extra indirection is needed; it does not
> make the code better..
> 
> Apart from that and the fact that the proc0 wakeup can go, I'm OK with this
> diff.

I like that better.  I'll commit the attached tomorrow unless I hear
otherwise.
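
To restate the motivation from earlier in the thread with a toy simulation
(illustrative only, not kernel code): "update when the second is zero modulo
five" silently drops an update whenever the once-per-second caller slips
past the matching second, while a saved deadline (or a self-rearming five
second timeout) catches up on the next call.

#include <stdio.h>

int
main(void)
{
	int t, modulo = 0, deadline = 0, next = 0;

	/* 30 simulated seconds; the tick for second 5 never runs. */
	for (t = 0; t < 30; t++) {
		if (t == 5)
			continue;
		if (t % 5 == 0)
			modulo++;		/* old scheme */
		if (t >= next) {
			deadline++;		/* new scheme */
			next += 5;
		}
	}
	printf("modulo: %d updates, deadline: %d updates\n", modulo, deadline);
	return 0;
}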

Index: share/man/man9/uvm_init.9
===
RCS file: /cvs/src/share/man/man9/uvm_init.9,v
retrieving revision 1.5
diff -u -p -r1.5 uvm_init.9
--- share/man/man9/uvm_init.9   21 May 2023 05:11:38 -  1.5
+++ share/man/man9/uvm_init.9   20 Jun 2023 15:20:59 -
@@ -168,7 +168,7 @@ argument is ignored.
 .Ft void
 .Fn uvm_kernacc "caddr_t addr" "size_t len" "int rw"
 .Ft void
-.Fn uvm_meter
+.Fn uvm_meter "void *"
 .Ft int
 .Fn uvm_sysctl "int *name" "u_int namelen" "void *oldp" "size_t *oldlenp" 
"void *newp " "size_t newlen" "struct proc *p"
 .Ft int
@@ -212,7 +212,7 @@ access, in the kernel address space.
 .Pp
 The
 .Fn uvm_meter
-function calculates the load average and wakes up the swapper if necessary.
+function periodically recomputes the load average.
 .Pp
 The
 .Fn uvm_sysctl
Index: sys/kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.74
diff -u -p -r1.74 sched_bsd.c
--- sys/kern/sched_bsd.c4 Feb 2023 19:33:03 -   1.74
+++ sys/kern/sched_bsd.c20 Jun 2023 15:20:59 -
@@ -234,7 +234,6 @@ schedcpu(void *arg)
}
SCHED_UNLOCK(s);
}
-   uvm_meter();
wakeup();
timeout_add_sec(to, 1);
 }
@@ -669,6 +668,7 @@ scheduler_start(void)
 
rrticks_init = hz / 10;
schedcpu(_to);
+   uvm_meter(NULL);
 
 #ifndef SMALL_KERNEL
if (perfpolicy == PERFPOL_AUTO)
Index: sys/uvm/uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.42
diff -u -p -r1.42 uvm_meter.c
--- sys/uvm/uvm_meter.c 28 Dec 2020 14:01:23 -  1.42
+++ sys/uvm/uvm_meter.c 20 Jun 2023 15:20:59 -
@@ -65,6 +65,9 @@
 int maxslp = MAXSLP;   /* patchable ... */
 struct loadavg averunnable;
 
+#define UVM_METER_INTVL5
+struct timeout uvm_meter_to = TIMEOUT_INITIALIZER(uvm_meter, NULL);
+
 /*
  * constants for averages over 1, 5, and 15 minutes when sampling at
  * 

Re: profclock, gmonclock: new callbacks for profil(2)/GPROF statclock() code

2023-06-19 Thread Scott Cheloha
> On Jun 19, 2023, at 18:07, Theo de Raadt  wrote:
> 
> Make sure to STOP all kernel profiling before attempting to
>suspend or hibernate your machine.  Otherwise I expect it
>will hang.
> 
> It is completely acceptable if it produces wrong results, but it must
> not hang the system.

The hang is present in -current, with or
without this patch.

I am working to figure it out.



Re: profclock, gmonclock: new callbacks for profil(2)/GPROF statclock() code

2023-06-19 Thread Scott Cheloha
On Mon, Jun 19, 2023 at 05:40:04PM -0500, Scott Cheloha wrote:
> This patch moves the profil(2)- and GPROF-specific parts of
> statclock() out into their own dedicated clock interrupts.
> 
> Test instructions will follow in a reply to this mail.  This needs
> testing on every platform.  Please reply with dmesgs and results.
> Non-amd64 results are greatly appreciated.
> 
> [...]

Okay, here are the instructions.

The profclock() piece should have no negative interactions with
suspend/resume or hibernate/unhibernate, so even if you don't want to
actively test the patch, passively testing the patch by leaving it
applied to the kernel on your daily driver will help.

That said, I want to stress the following:

Suspend/resume and hibernate/unhibernate are NOT compatible with
GPROF profiling.  I assume there are many untested interactions
between the two.

Make sure to STOP all kernel profiling before attempting to
suspend or hibernate your machine.  Otherwise I expect it
will hang.

GPROF is a kernel compile-time option.  If you don't enable it,
you have nothing to worry about.

--

To actively test either the profclock or the GPROF parts, first do the
following:

1. Download and apply the work-in-progress moncontrol/gprof patch.
   The patch will make it much easier to test the profclock() parts.
   I sent the patch to tech@ recently:

From: Scott Cheloha 
Date: Mon, 19 Jun 2023 03:25:37 +
Subject: moncontrol(3), gprof(1): write, read gmon profiling data via utrace(2)

   Or get it from a list archive:

$ PATCH_URL='https://marc.info/?l=openbsd-tech&m=168714523619637&q=raw'
$ PATCH_PATH=/tmp/moncontrol-gprof-utrace.patch2
$ ftp -o $PATCH_PATH $PATCH_URL
$ cd /usr/src
$ patch -C < $PATCH_PATH && patch < $PATCH_PATH

2. Rebuild and reinstall both libc and gprof(1).

$ cd /usr/src/lib/libc && doas make clean && doas make && doas make install
$ cd /usr/src/usr.bin/gprof && doas make clean && doas make && doas make install

--

To test the profclock() parts:

1. In one terminal, start systat(1).  We'll use it to observe the
   clock interrupt rate.  Note the baseline clock interrupt rate.
   It is usually about ((Number-of-online-CPUs) x 200).

$ systat vmstat 2

2. Check your kernel's profhz.  It's normally 1000, though there are
   exceptions.

$ sysctl -n kern.clockrate
tick = 10000, hz = 100, profhz = 1000, stathz = 100
^

3. In a second terminal, build a profiling binary, then run it with
   ktrace(1) to capture the profiling data.  I will use md5(1) here
   because it is easy to eyeball-test.

$ cd /usr/src/bin/md5
$ doas make clean
$ doas make CFLAGS=-pg LDFLAGS='-pg -static'
$ command time ktrace -f ktrace.md5.out1 -tu obj/md5 -ttt

4. Observe the clock interrupt rate in the first terminal.  When the
   profiled process is onproc, the rate should increase by the profhz
   value from step (2).  For example, if your baseline rate is ~1600
   and your kernel's profhz is 1000, the rate should increase to ~2600.

   The md5(1) time trial is mostly userspace computation.  On an otherwise
   idle system the clock interrupt rate should *stay* at that increased
   rate until the program exits.

   When the program exits, the clock interrupt rate should immediately
   return to the baseline rate from step (1).

5. Check the results with your patched gprof(1).  If you're familiar
   with gprof(1)'s output, check that the output is reasonable.

$ gprof obj/md5 ktrace.md5.out1 | less

--

To test the GPROF part:

1. Build and install a GPROF kernel.  This is briefly described in
   options(4).

   In short: first add the following to conf/GENERIC:

makeoptions PROF="-pg"
option  GPROF

   Second: remake your kernel config.  Last: clean, rebuild, install,
   and reboot.

2. After rebooting, you can double-check you're running a GPROF kernel
   by looking for "Profiling kernel" in your dmesg.

$ dmesg | fgrep 'Profiling kernel'
Profiling kernel, textsize=33858968 [8000..8204a598]

3. In one terminal start systat(1) as described above for the profclock()
   testing in steps (1) and (2).  Note the baseline interrupt rate, note
   your kernel's profhz value, etc.

4. In a second terminal, profile a CPU with kgmon(8).  I'm going to
   use CPU 2 in this example.

$ command time doas ksh -c "kgmon -c 2 -b && sleep 10 ; doas kgmon -c 2 -h -pr"
kgmon: kernel profiling is running for cpu 2.
kgmon: kernel profiling is off for cpu 2.
   10.089 real 0.000 user 0.060 sys

5. Observe the clock interrupt rate in the first terminal.

   The rate should increase by the profhz value from step (2).  For
   example, if your baseline rate is ~1600 and your kernel's profhz
   is 1000, the rate should increase to ~2600.  A profiled CPU is
   always "running"

profclock, gmonclock: new callbacks for profil(2)/GPROF statclock() code

2023-06-19 Thread Scott Cheloha
This patch moves the profil(2)- and GPROF-specific parts of
statclock() out into their own dedicated clock interrupts.

Test instructions will follow in a reply to this mail.  This needs
testing on every platform.  Please reply with dmesgs and results.
Non-amd64 results are greatly appreciated.

Anyway, back to the patch.  There are several upsides to breaking the
statclock() apart:

- It eliminates the concept of an "effective statclock rate", a
  clever hack introduced in the early '90s to accomodate now-ancient
  clock interrupt hardware.  We don't need an effective statclock
  rate.  The complexity it introduces is pointless and buys us
  nothing.

- It simplifies statclock(9).  This makes statclock() easier
  to change to suit e.g. imminent scheduler changes.

  In particular, this patch simplifies or deletes a ton of
  clock interrupt code dedicated to managing the "effective
  statclock rate".

- It allows us to enable/disable the profil(2) and GPROF interrupts on
  a per-CPU basis.  This dramatically reduces the global cost of profil(2)
  and kgmon(8) on the system: only the relevant CPU sees an increased
  interrupt rate.  In particular, the profil(2) interrupt is only active
  when a thread of interest is actually running.

Shortlist of changes:

- Move the profil(2)-specific code from statclock() into a new routine,
  profclock(), in subr_prof.c.  The profclock() clockintr is established
  during sched_init_cpu() for each CPU's scheduler.

- The profclock() is toggled on during mi_switch() when a thread in a
  profiled process runs.  It is toggled off during mi_switch() or
  sched_exit() when that thread yields.  The scheduler tracks the
  state of the profclock interrupt with a new flag, SPCF_PROFCLOCK.
  (See the sketch after this list.)

  profil(2) is not normally running, so these changes should not
  change the overhead of mi_switch().  Under normal circumstances,
  these new branches are cold.

- profil(2) calls need_resched() to force the profclock() on or
  off when the profiling state of a process changes.

- Move the GPROF-specific code from statclock() into a new routine,
  gmonclock(), in subr_prof.c.  On GPROF kernels, the gmonclock()
  clockintr is established during prof_init() for each CPU.  Failure
  to establish the gmonclock interrupt does not panic the kernel;
  this mirrors the current behavior.

- gmonclock() is toggled on and off from sysctl_doprof().

- Delete all code and state related to the "effective statclock rate".
  We can throw out:

  + psdiv, pscnt, and psratio.
  + spc_pscnt, spc_psdiv, and all related code in statclock().
  + stat_*, prof_*, clockintr_mtx, clockintr_setstatclockrate(), and
the lockless read loop in clockintr_statclock().

- The new clock interrupts have an execution period of profclock_period
  nanoseconds, which is set during initclocks().  There is no longer any
  material reason to require that profhz be an even multiple of stathz.
  We can delete the KASSERT in a separate patch.

- On amd64, changing the size of schedstate_percpu changes the alignment
  of certain things in cpu_info and breaks compilation.  I have done what
  I can to get it to compile again.

  I need help rearranging amd64's cpu_info to avoid upsetting any
  delicate cacheline groupings.
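
Below is a rough, illustrative sketch of the mi_switch() toggle described
above.  It is not the patch itself: the spc_profclock handle and the
clockintr_advance()/clockintr_cancel() calls are assumed to match the
clockintr API this series builds on, and the exact placement may differ.

        /*
         * Sketch only: on the way into the new thread p, run this CPU's
         * profclock only while a profiled process is on the CPU.  "spc"
         * is the CPU's schedstate_percpu; spc_profclock is assumed to
         * have been established with clockintr_establish() during
         * sched_init_cpu().
         */
        if (ISSET(p->p_p->ps_flags, PS_PROFIL) &&
            !ISSET(spc->spc_flags, SPCF_PROFCLOCK)) {
                atomic_setbits_int(&spc->spc_flags, SPCF_PROFCLOCK);
                clockintr_advance(spc->spc_profclock, profclock_period);
        } else if (!ISSET(p->p_p->ps_flags, PS_PROFIL) &&
            ISSET(spc->spc_flags, SPCF_PROFCLOCK)) {
                atomic_clearbits_int(&spc->spc_flags, SPCF_PROFCLOCK);
                clockintr_cancel(spc->spc_profclock);
        }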

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.108
diff -u -p -r1.108 kern_clock.c
--- kern/kern_clock.c   25 Apr 2023 00:58:47 -  1.108
+++ kern/kern_clock.c   19 Jun 2023 22:36:54 -
@@ -49,10 +49,6 @@
 #include 
 #include 
 
-#if defined(GPROF) || defined(DDBPROF)
-#include 
-#endif
-
 #include "dt.h"
 #if NDT > 0
 #include 
@@ -87,8 +83,6 @@ int   schedhz;
 intprofhz;
 intprofprocs;
 intticks = INT_MAX - (15 * 60 * HZ);
-static int psdiv, pscnt;   /* prof => stat divider */
-intpsratio;/* ratio: prof / stat */
 
 volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
 
@@ -99,16 +93,13 @@ void
 initclocks(void)
 {
/*
-* Set divisors to 1 (normal case) and let the machine-specific
-* code do its bit.
+* Let the machine-specific code do its bit.
 */
-   psdiv = pscnt = 1;
cpu_initclocks();
 
-   /*
-* Compute profhz/stathz.
-*/
-   psratio = profhz / stathz;
+   KASSERT(profhz >= stathz && profhz <= 10);
+   KASSERT(profhz % stathz == 0);
+   profclock_period = 10 / profhz;
 
inittimecounter();
 }
@@ -256,7 +247,6 @@ startprofclock(struct process *pr)
atomic_setbits_int(>ps_flags, PS_PROFIL);
if (++profprocs == 1) {
s = splstatclock();
-   psdiv = pscnt = psratio;
setstatclockrate(profhz);
splx(s);
}
@@ -275,7 +265,6 @@ stopprofclock(struct process *pr)

Re: uvm_meter: improve periodic execution logic for uvm_loadav()

2023-06-19 Thread Scott Cheloha
On Mon, Jun 19, 2023 at 10:22:56AM +0200, Claudio Jeker wrote:
> On Sun, Jun 18, 2023 at 12:43:18PM -0500, Scott Cheloha wrote:
> > On Sun, Jun 18, 2023 at 12:36:07PM -0500, Scott Cheloha wrote:
> > > On Sun, Jun 18, 2023 at 07:32:56PM +0200, Mark Kettenis wrote:
> > > > > Date: Sun, 18 Jun 2023 12:27:17 -0500
> > > > > From: Scott Cheloha 
> > > > > 
> > > > > The intent here is to update the load averages every five seconds.
> > > > > However:
> > > > > 
> > > > > 1. Measuring elapsed time with the UTC clock is unreliable because of
> > > > >settimeofday(2).
> > > > > 
> > > > > 2. "Call uvm_loadav() no more than once every five seconds", is not
> > > > > equivalent to "call uvm_loadav() if the current second is equal
> > > > > to zero, modulo five".
> > > > > 
> > > > >Not hard to imagine edge cases where timeouts are delayed and
> > > > >the load averages are not updated.
> > > > > 
> > > > > So, (1) use the monotonic clock, and (2) keep the next uvm_loadav()
> > > > > call time in a static value.
> > > > > 
> > > > > ok?
> > > > 
> > > > I really don't see why the calculatin of something vague like the load
> > > > average warrants complicating the code like this.
> > > 
> > > Aren't load averages used to make decisions about thread placement in
> > > the scheduler?
> > > 
> > > Regardless, the code is still wrong.  At minimum you should use
> > > getuptime(9).
> > 
> > Maybe I misunderstood.  Are you suggesting this?
> > 
> > 
> > now = getuptime();
> > if (now >= next_uvm_loadav) {
> > next_uvm_loadav = now + 5;
> > uvm_loadav(...);
> > }
> > 
> > The patch I posted preserves the current behavior.  It is equivalent
> > to:
> > 
> > while (next_uvm_loadav <= now)
> > next_uvm_loadav += 5;
> > 
> > Changing it to (now + 5) changes the behavior.
> 
> To be honest, I think the uvm_meter should be called via timeout(9) and
> not be called via the current path using schedcpu().
> At some point schedcpu() may be removed and then we need to fix this
> proper anyway.

See attached.

This changes the wakeup interval for proc0 (the swapper) on kernels
where maxslp has been patched (highly unusual).

On default kernels, maxslp is 20, which divides evenly into 5, so
the patch does not change the proc0 wakeup interval.

Another approach would be to run the uvm_meter() timeout every second
and track the uvm_loadav() deadline in a static variable.

> Running the lbolt wakeup inside schedcpu() has the same issue.

Eliminating lbolt has been a goal of mine for several years.  Tried
to remove as many users as possible a few years ago.  The tricky cases
are in the sys/kern tty code.

--

Index: kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.74
diff -u -p -r1.74 sched_bsd.c
--- kern/sched_bsd.c4 Feb 2023 19:33:03 -   1.74
+++ kern/sched_bsd.c19 Jun 2023 21:35:22 -
@@ -234,7 +234,6 @@ schedcpu(void *arg)
}
SCHED_UNLOCK(s);
}
-   uvm_meter();
wakeup();
timeout_add_sec(to, 1);
 }
@@ -669,6 +668,7 @@ scheduler_start(void)
 
rrticks_init = hz / 10;
schedcpu(_to);
+   uvm_meter_start();
 
 #ifndef SMALL_KERNEL
if (perfpolicy == PERFPOL_AUTO)
Index: uvm/uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.42
diff -u -p -r1.42 uvm_meter.c
--- uvm/uvm_meter.c 28 Dec 2020 14:01:23 -  1.42
+++ uvm/uvm_meter.c 19 Jun 2023 21:35:22 -
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -65,6 +66,9 @@
 int maxslp = MAXSLP;   /* patchable ... */
 struct loadavg averunnable;
 
+/* Update load averages every five seconds. */
+#define UVM_METER_INTVL5
+
 /*
  * constants for averages over 1, 5, and 15 minutes when sampling at
  * 5 second intervals.
@@ -78,17 +82,29 @@ static fixpt_t cexp[3] = {
 
 
 static void uvm_loadav(struct loadavg *);
+void uvm_meter(void *);
 void uvm_total(struct vmtotal *);
 void uvmexp_read(struct uvmexp *);
 
+void
+uvm_meter_start(void)
+{
+   static struct timeout to = TIMEOUT_INITIALIZER(uvm_meter, );
+
+   uvm_meter();
+}
+
 /*
  * uvm_meter: calculate load average and wake up the swapper (if needed)
  *

moncontrol(3), gprof(1): write, read gmon profiling data via utrace(2)

2023-06-18 Thread Scott Cheloha
+FILE   *ktrace, *pfile;
 intsize;
 intrate;
 
-if((pfile = fopen(filename, "r")) == NULL)
-   err(1, "fopen: %s", filename);
+if (gflag) {
+   if ((pfile = fopen(filename, "r")) == NULL)
+   err(1, "fopen: %s", filename);
+} else {
+   ktrace = fopen(filename, "r");
+   if (ktrace == NULL)
+   err(1, "fopen: %s", filename);
+   pfile = ktrace_extract(ktrace, filename);
+   if (pfile == NULL)
+   errx(1, "%s: ktrace extraction failed", filename);
+   if (fclose(ktrace) == EOF)
+   err(1, "fclose: %s", filename);
+}
 if (fread(, sizeof(struct gmonhdr), 1, pfile) != 1)
errx(1, "%s: bad gmon header", filename);
 if ( s_highpc != 0 && ( tmp.lpc != gmonhdr.lpc ||
Index: usr.bin/gprof/extract.c
===
RCS file: usr.bin/gprof/extract.c
diff -N usr.bin/gprof/extract.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ usr.bin/gprof/extract.c 19 Jun 2023 03:15:15 -
@@ -0,0 +1,317 @@
+/* $OpenBSD$   */
+/*
+ * Copyright (c) 2023 Sebastien Marie 
+ * Copyright (c) 2023 Scott Cheloha 
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define GMON_LABEL "_openbsd_libc_gmon"
+#define GMON_LABEL_LEN (sizeof(GMON_LABEL) - 1)
+
+/*
+ * A rudimentary gmon.out deserialization state machine.
+ * Allows for basic error-checking and the detection of an
+ * incomplete record set.
+ */
+enum gmon_state {
+   HEADER,
+   KCOUNT,
+   RAWARC,
+   FOOTER,
+   ERROR
+};
+
+struct gmon_de {
+   size_t sample_count;/* kcount array: current sample count */ 
+   size_t sample_total;/* kcount array: total samples in array */
+   enum gmon_state state;  /* gmon.out deserialization step */
+};
+
+void de_warnx(const char *, const char *, ...);
+void gmon_append(FILE *, const char *, struct gmon_de *, const char *, char *);
+int ktrace_header(FILE *, struct ktr_header *);
+int ktrace_next(FILE *, const char *, struct ktr_header *, void **, size_t *);
+
+FILE *
+ktrace_extract(FILE *kfp, const char *ktrace_path)
+{
+   struct _user_trace {
+   struct ktr_user hdr;
+   char buf[KTR_USER_MAXLEN + 1];  /* +1 for NUL */
+   } *user_trace;
+   char temp_path[32];
+   struct gmon_de de = { .state = HEADER };
+   struct ktr_header header = { 0 };
+   FILE *tfp;
+   void *buf = NULL, *label;
+   size_t buf_size = 0, len;
+   int fd, have_pid = 0, saved_errno;
+   pid_t pid;
+
+   /* Deserialize moncontrol(3) records into a temporary file. */
+   len = strlcpy(temp_path, "/tmp/gmon.out.XX", sizeof temp_path);
+   assert(len < sizeof temp_path);
+   fd = mkstemp(temp_path);
+   if (fd == -1) {
+   warn("mkstemp");
+   return NULL;
+   }
+
+   /*
+* We have opened a file descriptor.  From this point on,
+* we need to to jump to "error" and clean up before returning.
+*/
+   if (unlink(temp_path) == -1) {
+   warn("unlink: %s", temp_path);
+   goto error;
+   }
+   tfp = fdopen(fd, "r+");
+   if (tfp == NULL) {
+   warn("%s", temp_path);
+   goto error;
+   }
+
+   if (ktrace_header(kfp, ) == -1) {
+   warn("%s", ktrace_path);
+   goto error;
+   }
+   if (header.ktr_type != htobe32(KTR_START)) {
+   warn("%s: not a valid ktrace file", ktrace_path);
+   goto error;
+   }
+
+   while (ktrace_next(kfp, ktrace_path, , , _size) != -1) {
+   /* Filter for utrace(2) headers with the gmon label. */
+   if (header.ktr_type != KTR_USER)
+   continue;
+   user_trace = buf;
+   label = _trace->hdr.ktr

Re: uvm_meter: improve periodic execution logic for uvm_loadav()

2023-06-18 Thread Scott Cheloha
On Sun, Jun 18, 2023 at 12:36:07PM -0500, Scott Cheloha wrote:
> On Sun, Jun 18, 2023 at 07:32:56PM +0200, Mark Kettenis wrote:
> > > Date: Sun, 18 Jun 2023 12:27:17 -0500
> > > From: Scott Cheloha 
> > > 
> > > The intent here is to update the load averages every five seconds.
> > > However:
> > > 
> > > 1. Measuring elapsed time with the UTC clock is unreliable because of
> > >settimeofday(2).
> > > 
> > > 2. "Call uvm_loadav() no more than once every five seconds", is not
> > > equivalent to "call uvm_loadav() if the current second is equal
> > > to zero, modulo five".
> > > 
> > >Not hard to imagine edge cases where timeouts are delayed and
> > >the load averages are not updated.
> > > 
> > > So, (1) use the monotonic clock, and (2) keep the next uvm_loadav()
> > > call time in a static value.
> > > 
> > > ok?
> > 
> > I really don't see why the calculatin of something vague like the load
> > average warrants complicating the code like this.
> 
> Aren't load averages used to make decisions about thread placement in
> the scheduler?
> 
> Regardless, the code is still wrong.  At minimum you should use
> getuptime(9).

Maybe I misunderstood.  Are you suggesting this?


now = getuptime();
if (now >= next_uvm_loadav) {
next_uvm_loadav = now + 5;
uvm_loadav(...);
}

The patch I posted preserves the current behavior.  It is equivalent
to:

while (next_uvm_loadav <= now)
next_uvm_loadav += 5;

Changing it to (now + 5) changes the behavior.
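
To make the behavioral difference concrete, here is a small userland toy
(illustration only, not kernel code) that replays both policies after the
hook runs 12 seconds past a deadline of 100:

        #include <stdio.h>

        int
        main(void)
        {
                long long keep = 100, reset = 100, now = 112;

                /* Patch behavior: stay on the original 5-second grid. */
                keep += 5 * ((now - keep) / 5 + 1);     /* -> 115 */

                /* (now + 5) behavior: the grid drifts with the delay. */
                reset = now + 5;                        /* -> 117 */

                printf("keep=%lld reset=%lld\n", keep, reset);
                return 0;
        }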

Index: uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.42
diff -u -p -r1.42 uvm_meter.c
--- uvm_meter.c 28 Dec 2020 14:01:23 -  1.42
+++ uvm_meter.c 18 Jun 2023 17:11:09 -
@@ -87,8 +87,16 @@ void uvmexp_read(struct uvmexp *);
 void
 uvm_meter(void)
 {
-   if ((gettime() % 5) == 0)
+   static time_t next_uvm_loadav;
+   time_t intvl_count, now;
+
+   now = getuptime();
+   if (now >= next_uvm_loadav) {
+   intvl_count = (now - next_uvm_loadav) / 5 + 1;
+   next_uvm_loadav += 5 * intvl_count;
uvm_loadav();
+   }
+
if (proc0.p_slptime > (maxslp / 2))
wakeup();
 }



Re: uvm_meter: improve periodic execution logic for uvm_loadav()

2023-06-18 Thread Scott Cheloha
On Sun, Jun 18, 2023 at 07:32:56PM +0200, Mark Kettenis wrote:
> > Date: Sun, 18 Jun 2023 12:27:17 -0500
> > From: Scott Cheloha 
> > 
> > The intent here is to update the load averages every five seconds.
> > However:
> > 
> > 1. Measuring elapsed time with the UTC clock is unreliable because of
> >settimeofday(2).
> > 
> > 2. "Call uvm_loadav() no more than once every five seconds", is not
> > equivalent to "call uvm_loadav() if the current second is equal
> > to zero, modulo five".
> > 
> >Not hard to imagine edge cases where timeouts are delayed and
> >the load averages are not updated.
> > 
> > So, (1) use the monotonic clock, and (2) keep the next uvm_loadav()
> > call time in a static value.
> > 
> > ok?
> 
> I really don't see why the calculatin of something vague like the load
> average warrants complicating the code like this.

Aren't load averages used to make decisions about thread placement in
the scheduler?

Regardless, the code is still wrong.  At minimum you should use
getuptime(9).



uvm_meter: improve periodic execution logic for uvm_loadav()

2023-06-18 Thread Scott Cheloha
The intent here is to update the load averages every five seconds.
However:

1. Measuring elapsed time with the UTC clock is unreliable because of
   settimeofday(2).

2. "Call uvm_loadav() no more than once every five seconds", is not
equivalent to "call uvm_loadav() if the current second is equal
to zero, modulo five".

   Not hard to imagine edge cases where timeouts are delayed and
   the load averages are not updated.

So, (1) use the monotonic clock, and (2) keep the next uvm_loadav()
call time in a static value.

ok?

Index: uvm_meter.c
===
RCS file: /cvs/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.42
diff -u -p -r1.42 uvm_meter.c
--- uvm_meter.c 28 Dec 2020 14:01:23 -  1.42
+++ uvm_meter.c 18 Jun 2023 17:11:09 -
@@ -87,8 +87,16 @@ void uvmexp_read(struct uvmexp *);
 void
 uvm_meter(void)
 {
-   if ((gettime() % 5) == 0)
+   static time_t next_uvm_loadav;
+   time_t intvl_count, now;
+
+   now = getuptime();
+   if (now >= next_uvm_loadav) {
+   intvl_count = (now - next_uvm_loadav) / 5 + 1;
+   next_uvm_loadav += 5 * intvl_count;
uvm_loadav();
+   }
+
if (proc0.p_slptime > (maxslp / 2))
wakeup();
 }



Re: Pull Request: Addition of String Data Type in /sys/sys/types.h

2023-06-18 Thread Scott Cheloha
> On Jun 18, 2023, at 02:19, Abderrahmane Ghellab 
>  wrote:
> 
> [...]
> 
> In my pull request , I made a
> simple modification to the file by adding a new line to introduce a string
> data type. Below, you can find the diff highlighting the changes made:
> 
> ```diff
> 4848   #endif
> 4949
> 5050   #if __BSD_VISIBLE
>51 + typedef char * string; /*String Data Type: Better than writing
> char* every time*/
> 5152typedef unsigned char u_char;
> 5253typedef unsigned short u_short;
> 5354typedef unsigned int u_int;
> ```
> 
> The addition of the `string` data type aims to enhance code readability and
> reduce verbosity. Instead of explicitly writing `char*` every time a string
> is declared, developers can now utilize the `string` type for convenience
> and brevity.
> 
> I believe that this modification can contribute positively to the OpenBSD
> codebase, simplifying code maintenance and making it more intuitive for
> developers. It aligns with the philosophy of OpenBSD to provide a secure,
> simple, and clean operating system.

this is a very high-quality troll, nice



Re: csh(1), ksh(1), time(1): print durations with millisecond precision

2023-06-14 Thread Scott Cheloha
On Wed, Jun 14, 2023 at 10:34:20AM -0400, Josiah Frentsos wrote:
> On Tue, Jun 13, 2023 at 10:59:53PM -0500, Scott Cheloha wrote:
> > Index: usr.bin/time/time.c
> > ===
> > RCS file: /cvs/src/usr.bin/time/time.c,v
> > retrieving revision 1.25
> > diff -u -p -r1.25 time.c
> > --- usr.bin/time/time.c 21 Aug 2017 13:38:02 -  1.25
> > +++ usr.bin/time/time.c 14 Jun 2023 03:23:29 -
> > @@ -100,19 +100,19 @@ main(int argc, char *argv[])
> > timespecsub(, , );
> >  
> > if (portableflag) {
> > -   fprintf(stderr, "real %9lld.%02ld\n",
> > -   (long long)during.tv_sec, during.tv_nsec/1000);
> > -   fprintf(stderr, "user %9lld.%02ld\n",
> > -   (long long)ru.ru_utime.tv_sec, 
> > ru.ru_utime.tv_usec/1);
> > -   fprintf(stderr, "sys  %9lld.%02ld\n",
> > -   (long long)ru.ru_stime.tv_sec, 
> > ru.ru_stime.tv_usec/1);
> > +   fprintf(stderr, "real %9lld.%03ld\n",
> > +   (long long)during.tv_sec, during.tv_nsec / 100);
> > +   fprintf(stderr, "user %9lld.%03ld\n",
> > +   (long long)ru.ru_utime.tv_sec, ru.ru_utime.tv_usec / 1000);
> > +   fprintf(stderr, "sys  %9lld.%03ld\n",
> > +   (long long)ru.ru_stime.tv_sec, ru.ru_stime.tv_usec / 1000);
> > } else {
> > -   fprintf(stderr, "%9lld.%02ld real ",
> > -   (long long)during.tv_sec, during.tv_nsec/1000);
> > -   fprintf(stderr, "%9lld.%02ld user ",
> > -   (long long)ru.ru_utime.tv_sec, 
> > ru.ru_utime.tv_usec/1);
> > -   fprintf(stderr, "%9lld.%02ld sys\n",
> > -   (long long)ru.ru_stime.tv_sec, 
> > ru.ru_stime.tv_usec/1);
> > +   fprintf(stderr, "%9lld.%03ld real ",
> > +   (long long)during.tv_sec, during.tv_nsec / 100);
> > +   fprintf(stderr, "%9lld.%0ld user ",
>
> 
> Should this be "%03ld"?

Whoops, yep, good catch.

Index: bin/csh/time.c
===
RCS file: /cvs/src/bin/csh/time.c,v
retrieving revision 1.18
diff -u -p -r1.18 time.c
--- bin/csh/time.c  8 Mar 2023 04:43:04 -   1.18
+++ bin/csh/time.c  14 Jun 2023 15:30:01 -
@@ -40,6 +40,7 @@
  * C Shell - routines handling process timing and niceing
  */
 static voidpdeltat(struct timeval *, struct timeval *);
+static voidpdelta_hms(const struct timespec *, const struct timespec *);
 
 void
 settimes(void)
@@ -145,7 +146,7 @@ prusage(struct rusage *r0, struct rusage
break;
 
case 'E':   /* elapsed (wall-clock) time */
-   pcsecs((long) ms);
+   pdelta_hms(e, b);
break;
 
case 'P':   /* percent time spent running */
@@ -227,8 +228,7 @@ pdeltat(struct timeval *t1, struct timev
 struct timeval td;
 
 timersub(t1, t0, );
-(void) fprintf(cshout, "%lld.%01ld", (long long)td.tv_sec,
-   td.tv_usec / 10);
+fprintf(cshout, "%lld.%03ld", (long long)td.tv_sec, td.tv_usec / 1000);
 }
 
 #define  P2DIG(i) (void) fprintf(cshout, "%d%d", (i) / 10, (i) % 10)
@@ -254,23 +254,18 @@ minsec:
 }
 
 void
-pcsecs(long l) /* PWP: print mm:ss.dd, l is in sec*100 */
+pdelta_hms(const struct timespec *t1, const struct timespec *t0)
 {
-int i;
+struct timespec elapsed;
+long long hours, minutes, seconds;
 
-i = l / 36;
-if (i) {
-   (void) fprintf(cshout, "%d:", i);
-   i = (l % 36) / 100;
-   P2DIG(i / 60);
-   goto minsec;
-}
-i = l / 100;
-(void) fprintf(cshout, "%d", i / 60);
-minsec:
-i %= 60;
-(void) fputc(':', cshout);
-P2DIG(i);
-(void) fputc('.', cshout);
-P2DIG((int) (l % 100));
+timespecsub(t1, t0, );
+hours = elapsed.tv_sec / 3600;
+minutes = (elapsed.tv_sec % 3600) / 60;
+seconds = elapsed.tv_sec % 60;
+if (hours != 0)
+   fprintf(cshout, "%lld:%02lld:", hours, minutes);
+else
+   fprintf(cshout, "%lld:", minutes);
+fprintf(cshout, "%02lld.%03ld", seconds, elapsed.tv_nsec / 100);
 }
Index: bin/ksh/c_sh.c
===
RCS file: /cvs/src/bin/ksh/c_sh.c,v
retrieving revision 1.64
diff -u -p -r1.64 c_sh.c
--- bin/ksh/c_sh.c  22 May 2020 07:50:07 -  1.64
+++ bin/ksh/c_sh.c  14 Jun 2023 15:3

csh(1), ksh(1), time(1): print durations with millisecond precision

2023-06-13 Thread Scott Cheloha
This patch bumps the precision of durations printed by csh(1), ksh(1),
and time(1) from centiseconds to milliseconds.  The csh(1) and ksh(1)
builtins "time" and "times" are affected.

My thinking is:

- All practical OpenBSD platforms have a timecounter with at least
  millisecond precision.

- It's not uncommon for people to run a custom HZ=1000 kernel.
  At HZ=1000, the profiled user and system durations offered by
  getrusage(2) are (arguably) millisecond precision.

  Yes, I know those numbers are profiled and thus not especially
  trustworthy, but it's no different from the situation on a
  HZ=100 kernel.

- The timing commands offered in other shells like bash and dash
  provide (at least) millisecond precision.

- Centiseconds are a bit odd.  They don't align with the tidy
  "thousands" separation typical of metric units.

- The POSIX standard for time(1) and the "times" builtin specifies
  that the durations are formatted as a floating point value of
  seconds, i.e. "%f".  This means millisecond precision is okay:

https://pubs.opengroup.org/onlinepubs/9699919799/utilities/time.html
https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#times
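
For illustration only (the numbers below are invented, not captured from a
real run), the visible change for the default time(1) output is roughly:

        $ /usr/bin/time sleep 1
        before:         1.00 real         0.00 user         0.01 sys
        after:          1.003 real        0.000 user        0.012 sys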

Index: bin/csh/time.c
===
RCS file: /cvs/src/bin/csh/time.c,v
retrieving revision 1.18
diff -u -p -r1.18 time.c
--- bin/csh/time.c  8 Mar 2023 04:43:04 -   1.18
+++ bin/csh/time.c  14 Jun 2023 03:23:29 -
@@ -40,6 +40,7 @@
  * C Shell - routines handling process timing and niceing
  */
 static voidpdeltat(struct timeval *, struct timeval *);
+static voidpdelta_hms(const struct timespec *, const struct timespec *);
 
 void
 settimes(void)
@@ -145,7 +146,7 @@ prusage(struct rusage *r0, struct rusage
break;
 
case 'E':   /* elapsed (wall-clock) time */
-   pcsecs((long) ms);
+   pdelta_hms(e, b);
break;
 
case 'P':   /* percent time spent running */
@@ -227,8 +228,7 @@ pdeltat(struct timeval *t1, struct timev
 struct timeval td;
 
 timersub(t1, t0, );
-(void) fprintf(cshout, "%lld.%01ld", (long long)td.tv_sec,
-   td.tv_usec / 10);
+fprintf(cshout, "%lld.%03ld", (long long)td.tv_sec, td.tv_usec / 1000);
 }
 
 #define  P2DIG(i) (void) fprintf(cshout, "%d%d", (i) / 10, (i) % 10)
@@ -254,23 +254,18 @@ minsec:
 }
 
 void
-pcsecs(long l) /* PWP: print mm:ss.dd, l is in sec*100 */
+pdelta_hms(const struct timespec *t1, const struct timespec *t0)
 {
-int i;
+struct timespec elapsed;
+long long hours, minutes, seconds;
 
-i = l / 36;
-if (i) {
-   (void) fprintf(cshout, "%d:", i);
-   i = (l % 36) / 100;
-   P2DIG(i / 60);
-   goto minsec;
-}
-i = l / 100;
-(void) fprintf(cshout, "%d", i / 60);
-minsec:
-i %= 60;
-(void) fputc(':', cshout);
-P2DIG(i);
-(void) fputc('.', cshout);
-P2DIG((int) (l % 100));
+timespecsub(t1, t0, );
+hours = elapsed.tv_sec / 3600;
+minutes = (elapsed.tv_sec % 3600) / 60;
+seconds = elapsed.tv_sec % 60;
+if (hours != 0)
+   fprintf(cshout, "%lld:%02lld:", hours, minutes);
+else
+   fprintf(cshout, "%lld:", minutes);
+fprintf(cshout, "%02lld.%03ld", seconds, elapsed.tv_nsec / 100);
 }
Index: bin/ksh/c_sh.c
===
RCS file: /cvs/src/bin/ksh/c_sh.c,v
retrieving revision 1.64
diff -u -p -r1.64 c_sh.c
--- bin/ksh/c_sh.c  22 May 2020 07:50:07 -  1.64
+++ bin/ksh/c_sh.c  14 Jun 2023 03:23:29 -
@@ -681,13 +681,13 @@ p_tv(struct shf *shf, int posix, struct 
 char *suffix)
 {
if (posix)
-   shf_fprintf(shf, "%s%*lld.%02ld%s", prefix ? prefix : "",
-   width, (long long)tv->tv_sec, tv->tv_usec / 1, suffix);
+   shf_fprintf(shf, "%s%*lld.%03ld%s", prefix ? prefix : "",
+   width, (long long)tv->tv_sec, tv->tv_usec / 1000, suffix);
else
-   shf_fprintf(shf, "%s%*lldm%02lld.%02lds%s", prefix ? prefix : 
"",
+   shf_fprintf(shf, "%s%*lldm%02lld.%03lds%s", prefix ? prefix : 
"",
width, (long long)tv->tv_sec / 60,
(long long)tv->tv_sec % 60,
-   tv->tv_usec / 1, suffix);
+   tv->tv_usec / 1000, suffix);
 }
 
 static void
@@ -695,14 +695,14 @@ p_ts(struct shf *shf, int posix, struct 
 char *suffix)
 {
if (posix)
-   shf_fprintf(shf, "%s%*lld.%02ld%s", prefix ? prefix : "",
-   width, (long long)ts->tv_sec, ts->tv_nsec / 1000,
+   shf_fprintf(shf, "%s%*lld.%03ld%s", prefix ? prefix : "",
+   width, (long long)ts->tv_sec, ts->tv_nsec / 100,
suffix);
else
-   shf_fprintf(shf, "%s%*lldm%02lld.%02lds%s", prefix ? prefix : 
"",
+ 

all platforms, main(): call clockqueue_init() just before sched_init_cpu()

2023-06-12 Thread Scott Cheloha
We need to initialize the per-CPU clockintr_queue struct before we can
call clockintr_establish() from sched_init_cpu().

Initialization is done with a call to clockqueue_init().  Currently we
call it during clockintr_cpu_init(), i.e. each CPU initializes its own
clockintr_queue struct.

This patch moves the clockqueue_init() call out of clockintr_cpu_init()
and into main() and the MD code, just before sched_init_cpu().  So, now
the primary CPU initializes the clockintr_queue struct on behalf of the
secondary CPUs.

No behavior change.

With this in place, we can start breaking pieces off of the
hardclock() and statclock() in the next patch.

ok?

Index: kern/init_main.c
===
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.320
diff -u -p -r1.320 init_main.c
--- kern/init_main.c1 Jan 2023 07:00:51 -   1.320
+++ kern/init_main.c12 Jun 2023 23:55:43 -
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -313,6 +314,7 @@ main(void *framep)
/* Initialize run queues */
sched_init_runqueues();
sleep_queue_init();
+   clockqueue_init(()->ci_queue);
sched_init_cpu(curcpu());
p->p_cpu->ci_randseed = (arc4random() & 0x7fff) + 1;
 
Index: kern/kern_clockintr.c
===
RCS file: /cvs/src/sys/kern/kern_clockintr.c,v
retrieving revision 1.21
diff -u -p -r1.21 kern_clockintr.c
--- kern/kern_clockintr.c   23 Apr 2023 00:08:36 -  1.21
+++ kern/kern_clockintr.c   12 Jun 2023 23:55:43 -
@@ -66,7 +66,6 @@ void clockintr_schedule(struct clockintr
 void clockintr_schedule_locked(struct clockintr *, uint64_t);
 void clockintr_statclock(struct clockintr *, void *);
 void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
-void clockqueue_init(struct clockintr_queue *);
 uint64_t clockqueue_next(const struct clockintr_queue *);
 void clockqueue_reset_intrclock(struct clockintr_queue *);
 uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);
@@ -114,7 +113,6 @@ clockintr_cpu_init(const struct intrcloc
 
KASSERT(ISSET(clockintr_flags, CL_INIT));
 
-   clockqueue_init(cq);
if (ic != NULL && !ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
cq->cq_intrclock = *ic;
SET(cq->cq_flags, CQ_INTRCLOCK);
Index: sys/clockintr.h
===
RCS file: /cvs/src/sys/sys/clockintr.h,v
retrieving revision 1.7
diff -u -p -r1.7 clockintr.h
--- sys/clockintr.h 20 Apr 2023 14:51:28 -  1.7
+++ sys/clockintr.h 12 Jun 2023 23:55:43 -
@@ -129,6 +129,7 @@ void clockintr_trigger(void);
  * Kernel API
  */
 
+void clockqueue_init(struct clockintr_queue *);
 int sysctl_clockintr(int *, u_int, void *, size_t *, void *, size_t);
 
 #endif /* _KERNEL */
Index: arch/alpha/alpha/cpu.c
===
RCS file: /cvs/src/sys/arch/alpha/alpha/cpu.c,v
retrieving revision 1.46
diff -u -p -r1.46 cpu.c
--- arch/alpha/alpha/cpu.c  10 Dec 2022 15:02:29 -  1.46
+++ arch/alpha/alpha/cpu.c  12 Jun 2023 23:55:43 -
@@ -597,6 +597,7 @@ cpu_hatch(struct cpu_info *ci)
ALPHA_TBIA();
alpha_pal_imb();
 
+   clockqueue_init(>ci_queue);
KERNEL_LOCK();
sched_init_cpu(ci);
nanouptime(>ci_schedstate.spc_runtime);
Index: arch/amd64/amd64/cpu.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.168
diff -u -p -r1.168 cpu.c
--- arch/amd64/amd64/cpu.c  24 Apr 2023 09:04:03 -  1.168
+++ arch/amd64/amd64/cpu.c  12 Jun 2023 23:55:43 -
@@ -664,6 +664,7 @@ cpu_attach(struct device *parent, struct
 #if defined(MULTIPROCESSOR)
cpu_intr_init(ci);
cpu_start_secondary(ci);
+   clockqueue_init(>ci_queue);
sched_init_cpu(ci);
ncpus++;
if (ci->ci_flags & CPUF_PRESENT) {
Index: arch/arm/arm/cpu.c
===
RCS file: /cvs/src/sys/arch/arm/arm/cpu.c,v
retrieving revision 1.57
diff -u -p -r1.57 cpu.c
--- arch/arm/arm/cpu.c  12 Mar 2022 14:40:41 -  1.57
+++ arch/arm/arm/cpu.c  12 Jun 2023 23:55:43 -
@@ -391,6 +391,7 @@ cpu_attach(struct device *parent, struct
"cpu-release-addr", 0);
}
 
+   clockqueue_init(>ci_queue);
sched_init_cpu(ci);
if (cpu_hatch_secondary(ci, spinup_method, spinup_data)) {
atomic_setbits_int(>ci_flags, CPUF_IDENTIFY);
Index: arch/arm64/arm64/cpu.c
===
RCS file: /cvs/src/sys/arch/arm64/arm64/cpu.c,v
retrieving revision 1.94
diff -u -p -r1.94 cpu.c
--- 

tc_init.9: misc. cleanup

2023-03-28 Thread Scott Cheloha
I would like to spruce up this manpage.

- Try to describe what kern_tc.c does more completely and a bit
  more plainly.

- Mention *all* the requirements.  Try to describe the rollover
  margin in plainer language.

- Revise field descriptions for struct timecounter.  No need to
  mention fields the driver doesn't need to initialize.  Document
  the new-ish tc_user field.

- Add a CONTEXT section.

- In SEE ALSO, switch to an https URI on the main freebsd.org
  website.

- In HISTORY, note that the code first appeared in FreeBSD 3.0.
  It was later ported to OpenBSD for the 3.6 release.

- Add an AUTHORS section.
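
For reference, a minimal driver-side sketch of the interface the page
documents.  The field names match the struct in the diff below; the
"mydev" names and the register-read helper are hypothetical.

        static u_int
        mydev_get_timecount(struct timecounter *tc)
        {
                /* Read the free-running 32-bit counter (hypothetical). */
                return mydev_read_count();
        }

        static struct timecounter mydev_timecounter = {
                .tc_get_timecount = mydev_get_timecount,
                .tc_counter_mask = 0xffffffff,          /* all 32 bits valid */
                .tc_frequency = 100000000,              /* fixed 100 MHz */
                .tc_name = "mydev",
                .tc_quality = 0,
        };

        /* In the driver's attach routine: */
        tc_init(&mydev_timecounter);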

Index: tc_init.9
===
RCS file: /cvs/src/share/man/man9/tc_init.9,v
retrieving revision 1.11
diff -u -p -r1.11 tc_init.9
--- tc_init.9   4 Feb 2023 19:19:36 -   1.11
+++ tc_init.9   29 Mar 2023 00:21:27 -
@@ -1,6 +1,7 @@
-.\"$OpenBSD: tc_init.9,v 1.11 2023/02/04 19:19:36 cheloha Exp $
+.\"$OpenBSD: tc_init.9,v 1.10 2023/01/17 10:10:11 jsg Exp $
 .\"
 .\" Copyright (c) 2004 Alexander Yurchenko 
+.\" Copyright (c) 2023 Scott Cheloha 
 .\"
 .\" Permission to use, copy, modify, and distribute this software for any
 .\" purpose with or without fee is hereby granted, provided that the above
@@ -14,83 +15,109 @@
 .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 .\"
-.Dd $Mdocdate: February 4 2023 $
+.Dd $Mdocdate: January 17 2023 $
 .Dt TC_INIT 9
 .Os
 .Sh NAME
 .Nm tc_init
-.Nd machine-independent binary timescale
+.Nd timecounting subsystem
 .Sh SYNOPSIS
 .In sys/timetc.h
 .Ft void
 .Fn tc_init "struct timecounter *tc"
 .Sh DESCRIPTION
-The timecounter interface is a machine-independent implementation
-of a binary timescale using whatever hardware support is at hand
-for tracking time.
+The
+.Sy timecounting
+subsystem implements a uniform interface to timekeeping hardware,
+measures the passage of time,
+and implements the kernel's software clocks
+.Po see
+.Xr microtime 9
+for details
+.Pc .
 .Pp
-A timecounter is a binary counter which has two properties:
-.Bl -bullet -offset indent
+A hardware clock is suitable for counting time if it meets the following
+requirements:
+.Bl -enum -offset indent
+.It
+It is a binary counter.
+.It
+It advances at a fixed, known frequency.
+.It
+Its count is synchronized between all CPUs on the system.
 .It
-it runs at a fixed, known frequency
+It continues counting when it rolls over.
 .It
-it has sufficient bits to not roll over in less than approximately
-max(2 msec, 2/HZ seconds) (the value 2 here is really 1 + delta, for some
-indeterminate value of delta)
+If
+.Xr hz 9
+is less than or equal to one millisecond,
+the counter does not roll over in less than two milliseconds.
+If
+.Xr hz 9
+exceeds one millisecond,
+the counter does not roll over in less than
+.Pq 2 / Va hz
+seconds.
 .El
 .Pp
-The interface between the hardware which implements a timecounter and the
-machine-independent code which uses this to keep track of time is a
+Hardware clocks are described with a
 .Va timecounter
 structure:
 .Bd -literal -offset indent
 struct timecounter {
-   timecounter_get_t   *tc_get_timecount;
-   u_int   tc_counter_mask;
-   u_int64_t   tc_frequency;
-   char*tc_name;
-   int tc_quality;
-   void*tc_priv;
-   struct timecounter  *tc_next;
-}
+   u_int (*tc_get_timecount)(struct timecounter *);
+   u_int tc_counter_mask;
+   u_int64_t tc_frequency;
+   char *tc_name;
+   int tc_quality;
+   void *tc_priv;
+   u_int tc_user;
+};
 .Ed
-.Pp
-The fields of the
-.Va timecounter
-structure are described below.
 .Bl -tag -width indent
 .It Ft u_int Fn (*tc_get_timecount) "struct timecounter *"
-This function reads the counter.
-It is not required to mask any unimplemented bits out, as long as they
-are constant.
+Reads the hardware clock and returns its count.
+Any unimplemented bits only need to be masked if they are not constant.
+If the counter is larger than 32 bits,
+this function must return a 32-bit subset.
+The subsystem requires an upward count;
+downward counts must be inverted before they are returned.
 .It Va tc_counter_mask
-This mask should mask off any unimplemented bits.
+The mask of implemented bits.
+Used to discard unimplemented bits from
+.Fn tc_get_timecount .
 .It Va tc_frequency
-Frequency of the counter in Hz.
+The counter's fixed frequency.
 .It Va tc_name
-Name of the timecounter.
-Can be any null-terminated string.
+The counter's unique name.
+A
+.Dv NUL Ns -terminated string.
 .It Va tc_quality
-Used to determine if this timecounter is better than another timecounter \-
-higher means better.
-If this field is negative, the counter is only us

Re: timer(4/sparc64): remove driver

2023-03-25 Thread Scott Cheloha
On Fri, Mar 24, 2023 at 09:56:55AM +0100, Claudio Jeker wrote:
> On Thu, Mar 23, 2023 at 05:37:05PM -0500, Scott Cheloha wrote:
> > On Sat, Mar 18, 2023 at 12:17:33PM -0600, Ted Bullock wrote:
> > > On 2023-03-17 12:29 p.m., Mike Larkin wrote:
> > > > On Thu, Mar 16, 2023 at 12:25:15PM -0500, Scott Cheloha wrote:
> > > >> This code has been dead since we switched sparc64 to clockintr several
> > > >> months ago.  Nobody has come forward asking for a timer(4/sparc64)
> > > >> intrclock.
> > > >>
> > > >> As of now, you need %TICK_CMPR or %STICK_CMPR to run OpenBSD on
> > > >> sparc64.  The only machines maybe lacking these registers are certain
> > > >> early HAL/Fujitsu models like SPARC64 I and II, and maybe SPARC64 III.
> > > >>
> > > >> We can remove the driver now or wait until after unlock.
> > > >>
> > > >> The driver implementation is mixed into sparc64/clock.c.  I think I
> > > >> got everything, but I'm not positive.
> > > > 
> > > > I would just wait until after unlock. ok mlarkin for removing dead code
> > > > once unlock happens.
> > > 
> > > Is that specific to Fujitsu variants of Sparc64 or will that hit Sun
> > > variants too? I use the sunblade 100, which uses has a IIe I think but
> > > this one is a sun branded chip as far as I'm aware.
> > 
> > Your SunBlade 100 is unaffected by this change.
> > 
> > All Sun/Oracle SPARC V9 CPUs ever shipped have ASR23 (%TICK_CMPR).
> > Starting with the UltaSPARC IIe, they all also have the %STICK and
> > %STICK_CMPR registers.
> > 
> > The only machines that maybe don't have %TICK_CMPR, and are possibly
> > already broken by changes I made months ago, are the first HAL CPUs:
> > SPARC64 I and II.  I can't be sure because I can't find documentation
> > for those specific CPUs online.
> > 
> > The HAL SPARC64 III and all later HAL/Fujitsu CPUs have ASR23 (dubbed
> > %TICK_MATCH in the early HAL docs), so all those CPUs are unaffected,
> > too.
> 
> HAL SPARC64 I - III are currently not supported. The TLB and IIRC cache 
> handling is different.

Wonderful!  Nothing to worry about then.  Removing this driver impacts
no supported CPUs.



Re: timer(4/sparc64): remove driver

2023-03-23 Thread Scott Cheloha
On Sat, Mar 18, 2023 at 12:17:33PM -0600, Ted Bullock wrote:
> On 2023-03-17 12:29 p.m., Mike Larkin wrote:
> > On Thu, Mar 16, 2023 at 12:25:15PM -0500, Scott Cheloha wrote:
> >> This code has been dead since we switched sparc64 to clockintr several
> >> months ago.  Nobody has come forward asking for a timer(4/sparc64)
> >> intrclock.
> >>
> >> As of now, you need %TICK_CMPR or %STICK_CMPR to run OpenBSD on
> >> sparc64.  The only machines maybe lacking these registers are certain
> >> early HAL/Fujitsu models like SPARC64 I and II, and maybe SPARC64 III.
> >>
> >> We can remove the driver now or wait until after unlock.
> >>
> >> The driver implementation is mixed into sparc64/clock.c.  I think I
> >> got everything, but I'm not positive.
> > 
> > I would just wait until after unlock. ok mlarkin for removing dead code
> > once unlock happens.
> 
> Is that specific to Fujitsu variants of Sparc64 or will that hit Sun
> variants too? I use the sunblade 100, which uses has a IIe I think but
> this one is a sun branded chip as far as I'm aware.

Your SunBlade 100 is unaffected by this change.

All Sun/Oracle SPARC V9 CPUs ever shipped have ASR23 (%TICK_CMPR).
Starting with the UltraSPARC IIe, they all also have the %STICK and
%STICK_CMPR registers.

The only machines that maybe don't have %TICK_CMPR, and are possibly
already broken by changes I made months ago, are the first HAL CPUs:
SPARC64 I and II.  I can't be sure because I can't find documentation
for those specific CPUs online.

The HAL SPARC64 III and all later HAL/Fujitsu CPUs have ASR23 (dubbed
%TICK_MATCH in the early HAL docs), so all those CPUs are unaffected,
too.



timer(4/sparc64): remove driver

2023-03-16 Thread Scott Cheloha
This code has been dead since we switched sparc64 to clockintr several
months ago.  Nobody has come forward asking for a timer(4/sparc64)
intrclock.

As of now, you need %TICK_CMPR or %STICK_CMPR to run OpenBSD on
sparc64.  The only machines maybe lacking these registers are certain
early HAL/Fujitsu models like SPARC64 I and II, and maybe SPARC64 III.

We can remove the driver now or wait until after unlock.

The driver implementation is mixed into sparc64/clock.c.  I think I
got everything, but I'm not positive.

Index: distrib/sets/lists/man/mi
===
RCS file: /cvs/src/distrib/sets/lists/man/mi,v
retrieving revision 1.1695
diff -u -p -r1.1695 mi
--- distrib/sets/lists/man/mi   14 Mar 2023 04:51:34 -  1.1695
+++ distrib/sets/lists/man/mi   16 Mar 2023 17:18:43 -
@@ -1979,7 +1979,6 @@
 ./usr/share/man/man4/sparc64/spif.4
 ./usr/share/man/man4/sparc64/ssm.4
 ./usr/share/man/man4/sparc64/tda.4
-./usr/share/man/man4/sparc64/timer.4
 ./usr/share/man/man4/sparc64/tvtwo.4
 ./usr/share/man/man4/sparc64/upa.4
 ./usr/share/man/man4/sparc64/uperf.4
Index: share/man/man4/man4.sparc64/Makefile
===
RCS file: /cvs/src/share/man/man4/man4.sparc64/Makefile,v
retrieving revision 1.82
diff -u -p -r1.82 Makefile
--- share/man/man4/man4.sparc64/Makefile25 Apr 2019 16:47:56 -  
1.82
+++ share/man/man4/man4.sparc64/Makefile16 Mar 2023 17:18:43 -
@@ -9,7 +9,7 @@ MAN=agten.4 apio.4 asio.4 audioce.4 aud
pcons.4 pmc.4 power.4 ppm.4 prtc.4 psycho.4 pyro.4 qe.4 qec.4 \
radeonfb.4 raptor.4 rfx.4 \
sab.4 sbbc.4 schizo.4 spif.4 ssm.4 \
-   tda.4 timer.4 tvtwo.4 upa.4 uperf.4 \
+   tda.4 tvtwo.4 upa.4 uperf.4 \
vbus.4 vcc.4 vcons.4 vds.4 vdsk.4 vigra.4 vldc.4 vnet.4 vpci.4 \
vrng.4 vrtc.4 vsw.4 \
xbox.4 zs.4 zx.4
Index: share/man/man4/man4.sparc64/timer.4
===
RCS file: share/man/man4/man4.sparc64/timer.4
diff -N share/man/man4/man4.sparc64/timer.4
--- share/man/man4/man4.sparc64/timer.4 31 May 2007 19:19:57 -  1.2
+++ /dev/null   1 Jan 1970 00:00:00 -
@@ -1,43 +0,0 @@
-.\" $OpenBSD: timer.4,v 1.2 2007/05/31 19:19:57 jmc Exp $
-.\"
-.\" Copyright (c) 2004 Jason L. Wright (ja...@thought.net)
-.\" All rights reserved.
-.\"
-.\" Redistribution and use in source and binary forms, with or without
-.\" modification, are permitted provided that the following conditions
-.\" are met:
-.\" 1. Redistributions of source code must retain the above copyright
-.\"notice, this list of conditions and the following disclaimer.
-.\" 2. Redistributions in binary form must reproduce the above copyright
-.\"notice, this list of conditions and the following disclaimer in the
-.\"documentation and/or other materials provided with the distribution.
-.\"
-.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
-.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-.\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-.\" DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
-.\" INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-.\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-.\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-.\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-.\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-.\" POSSIBILITY OF SUCH DAMAGE.
-.\"
-.Dd $Mdocdate: May 31 2007 $
-.Dt TIMER 4 sparc64
-.Os
-.Sh NAME
-.Nm timer
-.Nd SPARC64 Timer
-.Sh SYNOPSIS
-.Cd "timer* at mainbus0"
-.Sh DESCRIPTION
-The
-.Nm
-device provides support for the onboard timer on SBus based
-UltraSPARC machines.
-The timers are used to control various time services in the
-kernel and are not user accessible.
-.Sh SEE ALSO
-.Xr intro 4
Index: sys/arch/sparc64/conf/GENERIC
===
RCS file: /cvs/src/sys/arch/sparc64/conf/GENERIC,v
retrieving revision 1.322
diff -u -p -r1.322 GENERIC
--- sys/arch/sparc64/conf/GENERIC   2 Jan 2022 23:14:27 -   1.322
+++ sys/arch/sparc64/conf/GENERIC   16 Mar 2023 17:18:43 -
@@ -342,9 +342,6 @@ clkbrd* at fhc?
 ## PROM clock -- if all else failse
 prtc0  at mainbus0
 
-## Timer chip found on (some) sun4u systems.
-timer* at mainbus0
-
 # Virtual devices for sun4v systems.
 vcons0 at vbus?
 vrtc0  at vbus?
Index: sys/arch/sparc64/conf/RAMDISK
===
RCS file: /cvs/src/sys/arch/sparc64/conf/RAMDISK,v
retrieving revision 1.126
diff -u -p -r1.126 RAMDISK
--- sys/arch/sparc64/conf/RAMDISK   15 Jul 2021 15:37:55 -  1.126
+++ 

Re: kernel: don't jump ticks, jiffies during boot

2023-03-01 Thread Scott Cheloha
On Mon, Feb 27, 2023 at 08:48:53PM -0600, Scott Cheloha wrote:
> On Tue, Feb 28, 2023 at 01:01:32PM +1100, Jonathan Gray wrote:
> > On Mon, Feb 27, 2023 at 06:26:00PM -0600, Scott Cheloha wrote:
> > > On Tue, Feb 28, 2023 at 10:18:16AM +1100, Jonathan Gray wrote:
> > > > On Mon, Feb 27, 2023 at 04:57:04PM -0600, Scott Cheloha wrote:
> > > > > ticks and jiffies start at zero.  During boot in initclocks(), we
> > > > > reset them:
> > > > > 
> > > > >   /* sys/kern/kern_clock.c */
> > > > > 
> > > > > 89int ticks;
> > > > > 90static int psdiv, pscnt;/* prof => stat 
> > > > > divider */
> > > > > 91int psratio;/* ratio: prof 
> > > > > / stat */
> > > > > 92
> > > > > 93volatile unsigned long jiffies; /* XXX Linux 
> > > > > API for drm(4) */
> > > > > 94
> > > > > 95/*
> > > > > 96 * Initialize clock frequencies and start both clocks 
> > > > > running.
> > > > > 97 */
> > > > > 98void
> > > > > 99initclocks(void)
> > > > >100{
> > > > >101ticks = INT_MAX - (15 * 60 * hz);
> > > > >102jiffies = ULONG_MAX - (10 * 60 * hz);
> > > > >103
> > > > >104/* [... ] */
> > > > > 
> > > > > The idea here (committed by dlg@) is sound.  We reset ticks and
> > > > > jiffies to near-rollover values to catch buggy code misusing them.
> > > > > 
> > > > > But!  That jump from zero to whatever violates valid assumptions made
> > > > > by correct code, too.
> > > > 
> > > > Assumptions made by what code?  Does it exist in the tree?
> > > 
> > > First, even if the code did not exist, wouldn't it be simpler to not
> > > do the jump?  No?
> > 
> > There are enough problems to fix without chasing ones that
> > don't exist.
> > 
> > > Second, with rare exception, all kernel code using ticks/jiffies
> > > assumes ticks/jiffies does not advance more than once every 1/hz
> > > seconds on average.
> > > 
> > > In timeout_add(9), we assign an absolute expiration time relative
> > > to the current value of ticks.  Code calling timeout_add(9) before
> > > initclocks() cannot account for the jump in initclocks().
> > 
> > What code calling timeout_add() before initclocks()?
> 
> I count 8 calls on my laptop:
> 
> [...]
> 
> Jumping ticks/jiffies as we do in initclocks() violates assumptions
> about how ticks/jiffies work.  If we initialize ticks/jiffies to
> values near rollover we can keep the test of correct behavior intended
> by dlg's patch without surprising other code.
> 
> ok?
> 
> Is there a better place to put "jiffies"?

kettenis@ suggested finding a way to initialize ticks/jiffies without
moving them.

HZ is not visible outside of param.c.  If we move the conditional
definition of HZ from param.c into sys/kernel.h (where hz(9) is
extern'd), kern_clock.c can use it.  I don't see a more obvious place
to put HZ than sys/kernel.h.

This fixes the problem.  The timeouts no longer fire early.

Can I go with this?
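
The sys/kernel.h hunk is cut off below, but the idea is presumably just to
move the conditional default next to the existing declarations, roughly
(assumed form, not the actual diff):

        /* sys/sys/kernel.h */
        #ifndef HZ
        #define HZ      100     /* default clock rate; config may override */
        #endif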

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.106
diff -u -p -r1.106 kern_clock.c
--- kern/kern_clock.c   4 Feb 2023 19:33:03 -   1.106
+++ kern/kern_clock.c   1 Mar 2023 16:34:50 -
@@ -86,11 +86,11 @@ int stathz;
 intschedhz;
 intprofhz;
 intprofprocs;
-intticks;
+intticks = INT_MAX - (15 * 60 * HZ);
 static int psdiv, pscnt;   /* prof => stat divider */
 intpsratio;/* ratio: prof / stat */
 
-volatile unsigned long jiffies;/* XXX Linux API for drm(4) */
+volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);
 
 /*
  * Initialize clock frequencies and start both clocks running.
@@ -98,9 +98,6 @@ volatile unsigned long jiffies;   /* XXX 
 void
 initclocks(void)
 {
-   ticks = INT_MAX - (15 * 60 * hz);
-   jiffies = ULONG_MAX - (10 * 60 * hz);
-
/*
 * Set divisors to 1 (normal case) and let the machine-specific
 * code do its bit.
Index: sys/kernel.h
===
RCS

Re: kernel: don't jump ticks, jiffies during boot

2023-02-27 Thread Scott Cheloha
On Tue, Feb 28, 2023 at 01:01:32PM +1100, Jonathan Gray wrote:
> On Mon, Feb 27, 2023 at 06:26:00PM -0600, Scott Cheloha wrote:
> > On Tue, Feb 28, 2023 at 10:18:16AM +1100, Jonathan Gray wrote:
> > > On Mon, Feb 27, 2023 at 04:57:04PM -0600, Scott Cheloha wrote:
> > > > ticks and jiffies start at zero.  During boot in initclocks(), we
> > > > reset them:
> > > > 
> > > > /* sys/kern/kern_clock.c */
> > > > 
> > > > 89  int ticks;
> > > > 90  static int psdiv, pscnt;/* prof => stat divider 
> > > > */
> > > > 91  int psratio;/* ratio: prof / stat */
> > > > 92  
> > > > 93  volatile unsigned long jiffies; /* XXX Linux API for 
> > > > drm(4) */
> > > > 94  
> > > > 95  /*
> > > > 96   * Initialize clock frequencies and start both clocks running.
> > > > 97   */
> > > > 98  void
> > > > 99  initclocks(void)
> > > >100  {
> > > >101  ticks = INT_MAX - (15 * 60 * hz);
> > > >102  jiffies = ULONG_MAX - (10 * 60 * hz);
> > > >103  
> > > >104  /* [... ] */
> > > > 
> > > > The idea here (committed by dlg@) is sound.  We reset ticks and
> > > > jiffies to near-rollover values to catch buggy code misusing them.
> > > > 
> > > > But!  That jump from zero to whatever violates valid assumptions made
> > > > by correct code, too.
> > > 
> > > Assumptions made by what code?  Does it exist in the tree?
> > 
> > First, even if the code did not exist, wouldn't it be simpler to not
> > do the jump?  No?
> 
> There are enough problems to fix without chasing ones that
> don't exist.
> 
> > Second, with rare exception, all kernel code using ticks/jiffies
> > assumes ticks/jiffies does not advance more than once every 1/hz
> > seconds on average.
> > 
> > In timeout_add(9), we assign an absolute expiration time relative
> > to the current value of ticks.  Code calling timeout_add(9) before
> > initclocks() cannot account for the jump in initclocks().
> 
> What code calling timeout_add() before initclocks()?

I count 8 calls on my laptop:

#0  timeout_add+0x43
#1  random_start+0xb7
#2  main+0x96
#3  longmode_hi+0x9c

#0  timeout_add+0x43
#1  thinkpad_attach+0x1f9
#2  config_attach+0x1f4
#3  acpi_foundhid+0x326
#4  aml_find_node+0x74
#5  aml_find_node+0xa1
#6  aml_find_node+0xa1
#7  aml_find_node+0xa1
#8  aml_find_node+0xa1
#9  aml_find_node+0xa1
#10 acpi_attach_common+0x6f4
#11 config_attach+0x1f4
#12 bios_attach+0x74f
#13 config_attach+0x1f4
#14 mainbus_attach+0x7b
#15 config_attach+0x1f4
#16 config_rootfound+0xd2
#17 cpu_configure+0x2a
#18 main+0x3a8

#0  timeout_add+0x43
#1  acpibat_attach+0x171
#2  config_attach+0x1f4
#3  acpi_foundhid+0x326
#4  aml_find_node+0x74
#5  aml_find_node+0xa1
#6  aml_find_node+0xa1
#7  aml_find_node+0xa1
#8  aml_find_node+0xa1
#9  aml_find_node+0xa1
#10 acpi_attach_common+0x6f4
#11 config_attach+0x1f4
#12 bios_attach+0x74f
#13 config_attach+0x1f4
#14 mainbus_attach+0x7b
#15 config_attach+0x1f4
#16 config_rootfound+0xd2
#17 cpu_configure+0x2a
#18 main+0x3a8

#0  timeout_add+0x43
#1  acpitz_attach+0x559
#2  config_attach+0x1f4
#3  acpi_add_device+0x147
#4  aml_walknodes+0x3b
#5  aml_walknodes+0x61
#6  aml_walknodes+0x61
#7  acpi_attach_common+0x712
#8  config_attach+0x1f4
#9  bios_attach+0x74f
#10 config_attach+0x1f4
#11 mainbus_attach+0x7b
#12 config_attach+0x1f4
#13 config_rootfound+0xd2
#14 cpu_configure+0x2a
#15 main+0x3a8

#0  timeout_add+0x43
#1  if_attachsetup+0x102
#2  if_attach+0x4e
#3  iwm_attach+0xe5a
#4  config_attach+0x1f4
#5  pci_probe_device+0x515
#6  pci_enumerate_bus+0x189
#7  config_attach+0x1f4
#8  ppbattach+0x790
#9  config_attach+0x1f4
#10 pci_probe_device+0x515
#11 pci_enumerate_bus+0x189
#12 config_attach+0x1f4
#13 acpipci_attach_bus+0x1b3
#14 acpipci_attach_busses+0x4d
#15 mainbus_attach+0x1c6
#16 config_attach+0x1f4
#17 config_rootfound+0xd2
#18 cpu_configure+0x2a

#0  timeout_add+0x43
#1  if_attachsetup+0x102
#2  if_attach+0x4e
#3  em_setup_interface+0x1c6
#4  em_attach+0x401
#5  config_attach+0x1f4
#6  pci_probe_device+0x515
#7  pci_enumerate_bus+0x189
#8  config_attach+0x1f4
#9  acpipci_attach_bus+0x1b3
#10 acpipci_attach_busses+0x4d
#11 mainbus_attach+0x1c6
#12 config_attach+0x1f4
#13 config_rootfound+0xd2
#14 cpu_configure+0x2a
#15 main+0x3a8

#0  timeout_add+0x43
#1  pckbdattach+0x172
#2  config_attach+0x1f4
#3  pckbc_attach+0x239
#4  pckbc_isa_attach+0x17c
#5  config_attach+0x1f4
#6  isascan+0x309
#7  config_scan+0xab
#8  config_attach+0x1f4
#9  pcib_callb

Re: kernel: don't jump ticks, jiffies during boot

2023-02-27 Thread Scott Cheloha
On Tue, Feb 28, 2023 at 10:18:16AM +1100, Jonathan Gray wrote:
> On Mon, Feb 27, 2023 at 04:57:04PM -0600, Scott Cheloha wrote:
> > ticks and jiffies start at zero.  During boot in initclocks(), we
> > reset them:
> > 
> > /* sys/kern/kern_clock.c */
> > 
> > 89  int ticks;
> > 90  static int psdiv, pscnt;/* prof => stat divider 
> > */
> > 91  int psratio;/* ratio: prof / stat */
> > 92  
> > 93  volatile unsigned long jiffies; /* XXX Linux API for 
> > drm(4) */
> > 94  
> > 95  /*
> > 96   * Initialize clock frequencies and start both clocks running.
> > 97   */
> > 98  void
> > 99  initclocks(void)
> >100  {
> >101  ticks = INT_MAX - (15 * 60 * hz);
> >102  jiffies = ULONG_MAX - (10 * 60 * hz);
> >103  
> >104  /* [... ] */
> > 
> > The idea here (committed by dlg@) is sound.  We reset ticks and
> > jiffies to near-rollover values to catch buggy code misusing them.
> > 
> > But!  That jump from zero to whatever violates valid assumptions made
> > by correct code, too.
> 
> Assumptions made by what code?  Does it exist in the tree?

First, even if the code did not exist, wouldn't it be simpler to not
do the jump?  No?

Second, with rare exception, all kernel code using ticks/jiffies
assumes ticks/jiffies does not advance more than once every 1/hz
seconds on average.

In timeout_add(9), we assign an absolute expiration time relative
to the current value of ticks.  Code calling timeout_add(9) before
initclocks() cannot account for the jump in initclocks().
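
A rough illustration of that failure mode, with made-up numbers (userland
toy, not the actual timeout wheel):

        #include <limits.h>
        #include <stdio.h>

        int
        main(void)
        {
                int hz = 100;
                int ticks = 0;
                int to_time = ticks + 100;      /* deadline set before initclocks() */

                ticks = INT_MAX - (15 * 60 * hz);       /* the initclocks() jump */

                /*
                 * The deadline is now ~2.1 billion ticks in the past, so
                 * anything keyed on the tick count treats the timeout as
                 * long expired and it fires immediately instead of one
                 * second (100 ticks) after it was added.
                 */
                printf("to_time - ticks = %d\n", to_time - ticks);
                return 0;
        }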

There is probably equivalent code in drm(4) making the same
assumption.

Relatedly, in cpu_relax() we increment jiffies if the kernel is
cold:

sys/dev/pci/drm/include/linux/processor.h

12  static inline void
13  cpu_relax(void)
14  {
15  CPU_BUSY_CYCLE();
16  if (cold) {
17  delay(tick);
18  jiffies++;
19  }
20  }



kernel: don't jump ticks, jiffies during boot

2023-02-27 Thread Scott Cheloha
ticks and jiffies start at zero.  During boot in initclocks(), we
reset them:

/* sys/kern/kern_clock.c */

89  int ticks;
90  static int psdiv, pscnt;/* prof => stat divider */
91  int psratio;/* ratio: prof / stat */
92  
93  volatile unsigned long jiffies; /* XXX Linux API for drm(4) */
94  
95  /*
96   * Initialize clock frequencies and start both clocks running.
97   */
98  void
99  initclocks(void)
   100  {
   101  ticks = INT_MAX - (15 * 60 * hz);
   102  jiffies = ULONG_MAX - (10 * 60 * hz);
   103  
   104  /* [... ] */

The idea here (committed by dlg@) is sound.  We reset ticks and
jiffies to near-rollover values to catch buggy code misusing them.

But!  That jump from zero to whatever violates valid assumptions made
by correct code, too.

It would be better to just initialize ticks and jiffies to the
near-rollover values when we declare them.  To do this we need to
move their declarations from sys/kern/kern_clock.c to sys/conf/param.c
where HZ is visible.

ok?

Index: kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.106
diff -u -p -r1.106 kern_clock.c
--- kern/kern_clock.c   4 Feb 2023 19:33:03 -   1.106
+++ kern/kern_clock.c   27 Feb 2023 22:55:24 -
@@ -86,21 +86,15 @@ int stathz;
 intschedhz;
 intprofhz;
 intprofprocs;
-intticks;
 static int psdiv, pscnt;   /* prof => stat divider */
 intpsratio;/* ratio: prof / stat */
 
-volatile unsigned long jiffies;/* XXX Linux API for drm(4) */
-
 /*
  * Initialize clock frequencies and start both clocks running.
  */
 void
 initclocks(void)
 {
-   ticks = INT_MAX - (15 * 60 * hz);
-   jiffies = ULONG_MAX - (10 * 60 * hz);
-
/*
 * Set divisors to 1 (normal case) and let the machine-specific
 * code do its bit.
@@ -171,7 +165,8 @@ hardclock(struct clockframe *frame)
 
tc_ticktock();
ticks++;
-   jiffies++;
+   extern volatile unsigned long jiffies;
+   jiffies++;  /* XXX drm(4) */
 
/*
 * Update the timeout wheel.
Index: conf/param.c
===
RCS file: /cvs/src/sys/conf/param.c,v
retrieving revision 1.47
diff -u -p -r1.47 param.c
--- conf/param.c13 Apr 2022 10:08:10 -  1.47
+++ conf/param.c27 Feb 2023 22:55:24 -
@@ -73,6 +73,8 @@
 #defineHZ 100
 #endif
 inthz = HZ;
+intticks = INT_MAX - (15 * 60 * HZ);
+volatile unsigned long jiffies = ULONG_MAX - (10 * 60 * HZ);   /* drm(4) */
 inttick = 100 / HZ;
 inttick_nsec = 10 / HZ;
 intutc_offset = 0;



timecounting: remove incomplete PPS support

2023-01-31 Thread Scott Cheloha
When the timecounting code was ported from FreeBSD in 2004 [1], stubs
for pulse-per-second (PPS) polling were brought in but left disabled.
They remain disabled [2]:

1.1   tholo 710:
711: #ifdef notyet
712:/*
713: * Hardware latching timecounters may not 
generate interrupts on
714: * PPS events, so instead we poll them.  There 
is a finite risk that
715: * the hardware might capture a count which is 
later than the one we
716: * got above, and therefore possibly in the 
next NTP second which might
717: * have a different rate than the current NTP 
second.  It doesn't
718: * matter in practice.
719: */
720:if (tho->th_counter->tc_poll_pps)
721:
tho->th_counter->tc_poll_pps(tho->th_counter);
722: #endif

It's been almost two decades.  I don't expect anyone to finish adding
support, so let's remove the stubs.

This patch gets rid of the tc_poll_pps symbol.

otto: No plans to use this?

ok?

[1] http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/kern/kern_tc.c?annotate=1.1
[2] 
http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/kern/kern_tc.c?annotate=1.81

Index: sys/sys/timetc.h
===
RCS file: /cvs/src/sys/sys/timetc.h,v
retrieving revision 1.13
diff -u -p -r1.13 timetc.h
--- sys/sys/timetc.h12 Aug 2022 02:20:36 -  1.13
+++ sys/sys/timetc.h31 Jan 2023 18:12:57 -
@@ -44,7 +44,6 @@
 
 struct timecounter;
 typedef u_int timecounter_get_t(struct timecounter *);
-typedef void timecounter_pps_t(struct timecounter *);
 
 /*
  * Locks used to protect struct members in this file:
@@ -59,13 +58,6 @@ struct timecounter {
 * This function reads the counter.  It is not required to
 * mask any unimplemented bits out, as long as they are
 * constant.
-*/
-   timecounter_pps_t   *tc_poll_pps;   /* [I] */
-   /*
-* This function is optional.  It will be called whenever the
-* timecounter is rewound, and is intended to check for PPS
-* events.  Normal hardware does not need it but timecounters
-* which latch PPS in hardware (like sys/pci/xrpu.c) do.
 */
u_int   tc_counter_mask;/* [I] */
/* This mask should mask off any unimplemented bits. */
Index: sys/kern/kern_tc.c
===
RCS file: /cvs/src/sys/kern/kern_tc.c,v
retrieving revision 1.81
diff -u -p -r1.81 kern_tc.c
--- sys/kern/kern_tc.c  13 Dec 2022 17:30:36 -  1.81
+++ sys/kern/kern_tc.c  31 Jan 2023 18:12:57 -
@@ -56,7 +56,6 @@ dummy_get_timecount(struct timecounter *
 
 static struct timecounter dummy_timecounter = {
.tc_get_timecount = dummy_get_timecount,
-   .tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
.tc_frequency = 100,
.tc_name = "dummy",
@@ -707,19 +706,6 @@ tc_windup(struct bintime *new_boottime, 
naptime = th->th_naptime.sec;
th->th_offset = *new_offset;
}
-
-#ifdef notyet
-   /*
-* Hardware latching timecounters may not generate interrupts on
-* PPS events, so instead we poll them.  There is a finite risk that
-* the hardware might capture a count which is later than the one we
-* got above, and therefore possibly in the next NTP second which might
-* have a different rate than the current NTP second.  It doesn't
-* matter in practice.
-*/
-   if (tho->th_counter->tc_poll_pps)
-   tho->th_counter->tc_poll_pps(tho->th_counter);
-#endif
 
/*
 * If changing the boot time or clock adjustment, do so before
Index: share/man/man9/tc_init.9
===
RCS file: /cvs/src/share/man/man9/tc_init.9,v
retrieving revision 1.10
diff -u -p -r1.10 tc_init.9
--- share/man/man9/tc_init.917 Jan 2023 10:10:11 -  1.10
+++ share/man/man9/tc_init.931 Jan 2023 18:12:57 -
@@ -46,7 +46,6 @@ structure:
 .Bd -literal -offset indent
 struct timecounter {
timecounter_get_t   *tc_get_timecount;
-   timecounter_pps_t   *tc_poll_pps;
u_int   tc_counter_mask;
u_int64_t   tc_frequency;
char*tc_name;
@@ -64,12 +63,6 @@ structure are described below.
 This function reads the counter.
 It is not required to mask any unimplemented bits out, as long as they
 are constant.
-.It Ft void Fn (*tc_poll_pps) "struct timecounter *"
-This function is optional and can be set to 

Re: hardclock: don't call statclock(), stathz is always non-zero

2023-01-31 Thread Scott Cheloha
On Mon, Jan 30, 2023 at 05:08:38PM +0100, Mark Kettenis wrote:
> > Date: Sat, 21 Jan 2023 17:02:48 -0600
> > From: Scott Cheloha 
> > 
> > All the platforms have switched to clockintr.
> > 
> > Let's start by isolating statclock() from hardclock().  stathz is now
> > always non-zero: statclock() must be called separately.  Update
> > several of the stathz users to reflect that the value is always
> > non-zero.
> > 
> > This is a first step toward making hardclock and statclock into
> > schedulable entities.
> > 
> > ok?
> 
> If you are confident enough to start burning bridges, yes ok kettenis@
> 
> Maybe you want to add
> 
> KASSERT(stathz != 0);
> KASSERT(profhz != 0);
> 
> at the start of initclocks() just to be sure?
> 
> Either way is fine with me.

I thought about doing that, but those checks are done during
cpu_initclocks(), in clockintr_init():

60  void
61  clockintr_init(u_int flags)
62  {
63  KASSERT(CPU_IS_PRIMARY(curcpu()));
64  KASSERT(clockintr_flags == 0);
65  KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
66  
67  KASSERT(hz > 0 && hz <= 1000000000);
68  hardclock_period = 1000000000 / hz;
69  
70  KASSERT(stathz >= 1 && stathz <= 1000000000);

Checking them again might make intent more explicit...  still, I'm
leaning toward leaving them out.



macppc, powerpc64: dec_rearm: no need to disable interrupts

2023-01-28 Thread Scott Cheloha
On macppc and powerpc64, I don't think we need to disable interrupts
during dec_rearm().  ppc_mtdec() and mtdec() are just:

mtspr   dec,

or

mtdec   

which is atomic.

Tested on macppc (my G4).  Kernel boots, system has been up for over
24 hours doing a parallel `make build`.

When I wrote dec_rearm() I think I was just imitating dec_trigger().
In that case, we *do* need to disable interrupts because we're doing
mtspr twice.

ok?
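
For reference, the rearm path after this change boils down to a single
store (this sketch just mirrors the macppc hunk below, comments mine):

	void
	dec_rearm(void *unused, uint64_t nsecs)
	{
		uint32_t cycles;

		if (nsecs > dec_nsec_max)
			nsecs = dec_nsec_max;
		cycles = (nsecs * dec_nsec_cycle_ratio) >> 32;
		if (cycles > UINT32_MAX >> 1)
			cycles = UINT32_MAX >> 1;
		ppc_mtdec(cycles);	/* one mtspr: atomic, so no
					 * ppc_intr_disable() bracket */
	}

dec_trigger() keeps its critical section because it issues two mtspr
writes back to back, and an interrupt between them could leave the
decrementer half-programmed.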

Index: macppc/macppc/clock.c
===
RCS file: /cvs/src/sys/arch/macppc/macppc/clock.c,v
retrieving revision 1.52
diff -u -p -r1.52 clock.c
--- macppc/macppc/clock.c   27 Jan 2023 22:13:48 -  1.52
+++ macppc/macppc/clock.c   28 Jan 2023 23:20:06 -
@@ -248,16 +248,13 @@ void
 dec_rearm(void *unused, uint64_t nsecs)
 {
uint32_t cycles;
-   int s;
 
if (nsecs > dec_nsec_max)
nsecs = dec_nsec_max;
cycles = (nsecs * dec_nsec_cycle_ratio) >> 32;
if (cycles > UINT32_MAX >> 1)
cycles = UINT32_MAX >> 1;
-   s = ppc_intr_disable();
ppc_mtdec(cycles);
-   ppc_intr_enable(s);
 }
 
 void
Index: powerpc64/powerpc64/clock.c
===
RCS file: /cvs/src/sys/arch/powerpc64/powerpc64/clock.c,v
retrieving revision 1.8
diff -u -p -r1.8 clock.c
--- powerpc64/powerpc64/clock.c 27 Jan 2023 22:14:43 -  1.8
+++ powerpc64/powerpc64/clock.c 28 Jan 2023 23:20:06 -
@@ -59,7 +59,6 @@ void  cpu_startclock(void);
 void
 dec_rearm(void *unused, uint64_t nsecs)
 {
-   u_long s;
uint32_t cycles;
 
if (nsecs > dec_nsec_max)
@@ -67,9 +66,7 @@ dec_rearm(void *unused, uint64_t nsecs)
cycles = (nsecs * dec_nsec_cycle_ratio) >> 32;
if (cycles > UINT32_MAX >> 1)
cycles = UINT32_MAX >> 1;
-   s = intr_disable();
mtdec(cycles);
-   intr_restore(s);
 }
 
 void



amd64, i386: set lapic timer mode, mask, divisor once

2023-01-27 Thread Scott Cheloha
mlarkin@ noted about a month or so ago that setting the lapic timer
mode, mask, and divisor every time we rearm it is unnecessary.  We
only need to configure those registers once during
lapic_timer_trigger().  After that, it is sufficient to set the ICR
when rearming the timer.

While here, add the missing intr_disable/intr_restore wrapper to
lapic_timer_trigger().  Writing multiple registers is not atomic, so
we need to disable interrupts for safety.  Setting the ICR during
lapic_timer_rearm() is atomic, so we don't need to disable interrupts
there.

ok?
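
Put differently, after this patch the steady-state path is one register
write, while the full programming happens once up front.  A sketch using
only the calls from the amd64 hunk below:

	/* trigger path, run once: mode, mask, divisor, and initial count */
	s = intr_disable();
	lapic_timer_oneshot(0, 1);		/* multiple register writes */
	intr_restore(s);

	/* every rearm afterwards: reload the initial-count register only */
	lapic_writereg(LAPIC_ICR_TIMER, cycles);	/* single, atomic write */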

Index: amd64/amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.65
diff -u -p -r1.65 lapic.c
--- amd64/amd64/lapic.c 10 Nov 2022 08:26:54 -  1.65
+++ amd64/amd64/lapic.c 27 Jan 2023 13:58:15 -
@@ -431,13 +431,17 @@ lapic_timer_rearm(void *unused, uint64_t
cycles = (nsecs * lapic_timer_nsec_cycle_ratio) >> 32;
if (cycles == 0)
cycles = 1;
-   lapic_timer_oneshot(0, cycles);
+   lapic_writereg(LAPIC_ICR_TIMER, cycles);
 }
 
 void
 lapic_timer_trigger(void *unused)
 {
+   u_long s;
+
+   s = intr_disable(); 
lapic_timer_oneshot(0, 1);
+   intr_restore(s);
 }
 
 /*
Index: i386/i386/lapic.c
===
RCS file: /cvs/src/sys/arch/i386/i386/lapic.c,v
retrieving revision 1.53
diff -u -p -r1.53 lapic.c
--- i386/i386/lapic.c   6 Dec 2022 01:56:44 -   1.53
+++ i386/i386/lapic.c   27 Jan 2023 13:58:15 -
@@ -268,13 +268,17 @@ lapic_timer_rearm(void *unused, uint64_t
cycles = (nsecs * lapic_timer_nsec_cycle_ratio) >> 32;
if (cycles == 0)
cycles = 1;
-   lapic_timer_oneshot(0, cycles);
+   i82489_writereg(LAPIC_ICR_TIMER, cycles);
 }
 
 void
 lapic_timer_trigger(void *unused)
 {
+   u_long s;
+
+   s = intr_disable();
lapic_timer_oneshot(0, 1);
+   intr_restore(s);
 }
 
 /*



dmtimer(4), macppc, powerpc64, riscv64: init stathz, profhz like other platforms

2023-01-26 Thread Scott Cheloha
Almost every platform initializes stathz and profhz like this:

stathz = hz;
profhz = stathz * 10;

This patch brings a few stragglers in line with everyone else.

This does not change stathz and profhz on the listed platforms: hz is
100 in every case.

ok?
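
For concreteness, with hz = 100 on all of these platforms the result is
unchanged:

	stathz  = hz              = 100
	profhz  = stathz * 10     = 1000
	psratio = profhz / stathz = 10	(computed later in initclocks())

i.e. the statistics clock keeps running at 100 Hz and switching a
process to profiling still speeds it up by a factor of ten.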

Index: macppc/macppc/clock.c
===
RCS file: /cvs/src/sys/arch/macppc/macppc/clock.c,v
retrieving revision 1.51
diff -u -p -r1.51 clock.c
--- macppc/macppc/clock.c   29 Nov 2022 00:58:05 -  1.51
+++ macppc/macppc/clock.c   27 Jan 2023 03:20:17 -
@@ -196,8 +196,8 @@ cpu_initclocks(void)
 
intrstate = ppc_intr_disable();
 
-   stathz = 100;
-   profhz = 1000; /* must be a multiple of stathz */
+   stathz = hz;
+   profhz = stathz * 10;
clockintr_init(CL_RNDSTAT);
 
	dec_nsec_cycle_ratio = ticks_per_sec * (1ULL << 32) / 1000000000;
Index: powerpc64/powerpc64/clock.c
===
RCS file: /cvs/src/sys/arch/powerpc64/powerpc64/clock.c,v
retrieving revision 1.7
diff -u -p -r1.7 clock.c
--- powerpc64/powerpc64/clock.c 29 Nov 2022 01:04:44 -  1.7
+++ powerpc64/powerpc64/clock.c 27 Jan 2023 03:20:17 -
@@ -98,8 +98,8 @@ cpu_initclocks(void)
	dec_nsec_cycle_ratio = tb_freq * (1ULL << 32) / 1000000000;
dec_nsec_max = UINT64_MAX / dec_nsec_cycle_ratio;
 
-   stathz = 100;
-   profhz = 1000; /* must be a multiple of stathz */
+   stathz = hz;
+   profhz = stathz * 10;
clockintr_init(CL_RNDSTAT);
 
evcount_attach(_count, "clock", NULL);
Index: riscv64/riscv64/clock.c
===
RCS file: /cvs/src/sys/arch/riscv64/riscv64/clock.c,v
retrieving revision 1.7
diff -u -p -r1.7 clock.c
--- riscv64/riscv64/clock.c 3 Dec 2022 15:03:49 -   1.7
+++ riscv64/riscv64/clock.c 27 Jan 2023 03:20:17 -
@@ -92,8 +92,8 @@ cpu_initclocks(void)
	timer_nsec_cycle_ratio = tb_freq * (1ULL << 32) / 1000000000;
timer_nsec_max = UINT64_MAX / timer_nsec_cycle_ratio;
 
-   stathz = 100;
-   profhz = 1000; /* must be a multiple of stathz */
+   stathz = hz;
+   profhz = stathz * 10;
clockintr_init(CL_RNDSTAT);
 
riscv_intc_intr_establish(IRQ_TIMER_SUPERVISOR, 0,
Index: armv7/omap/dmtimer.c
===
RCS file: /cvs/src/sys/arch/armv7/omap/dmtimer.c,v
retrieving revision 1.16
diff -u -p -r1.16 dmtimer.c
--- armv7/omap/dmtimer.c17 Jan 2023 02:32:07 -  1.16
+++ armv7/omap/dmtimer.c27 Jan 2023 03:20:18 -
@@ -230,8 +230,8 @@ dmtimer_cpu_initclocks(void)
 {
struct dmtimer_softc*sc = dmtimer_cd.cd_devs[1];
 
-   stathz = 100;
-   profhz = 1000;
+   stathz = hz;
+   profhz = stathz * 10;
clockintr_init(CL_RNDSTAT);
 
sc->sc_ticks_per_second = TIMER_FREQUENCY; /* 32768 */



Re: gptimer(4): switch to clockintr

2023-01-25 Thread Scott Cheloha
On Tue, Jan 24, 2023 at 10:32:49PM +0100, Mark Kettenis wrote:
> > Date: Tue, 24 Jan 2023 20:57:48 +0100
> > From: Patrick Wildt 
> > 
> > On Mon, Jan 23, 2023 at 04:34:27PM -0600, Scott Cheloha wrote:
> > > Whoops, missed one.  This is the fifth and (I think) last armv7 clock
> > > interrupt driver that needs to switch to clockintr.  gptimer(4) is
> > > nearly identical to dmtimer(4).
> > > 
> > > Notable changes:
> > > 
> > > - Switch stathz from 128 to hz.
> > > - Switch profhz from 1024 to (stathz * 10).
> > > 
> > > Everything else in the patch is just normal clockintr switching stuff
> > > or duplicated from the dmtimer(4) patch.
> > > 
> > > jca@ has compile-tested this.
> > > 
> > > I would appreciate a test to confirm that the GENERIC boots.  I don't
> > > think we need to do a full release build.
> > > 
> > > ... if nobody knows where to find a board with gptimer(4), I am
> > > looking for permission to just commit this as-is.  I cannot find any
> > > entries in the dmesg mails of any machines with gptimer(4).  kettenis@
> > > was uncertain whether we actually support any of the machines that have
> > > this clock.
> > > 
> > > Preferences?  ok?
> > 
> > Should we just get rid of it?  It's only used on OMAP3, which was
> > already outdated when I started my ARMv7 endeavour a decade ago.
> > I never had that machine, drahn@ might have one in the attic.
> > 
> > The relevant platforms were the Pandaboard (OMAP4) and the BeagleBone
> > Black (AM335x).  But the OMAP3, did this thing ever work?
> 
> Might as well commit this such that we have it in the attic just in
> case someone wants to revive OMAP3?

Patch committed.



Re: hardclock: don't call statclock(), stathz is always non-zero

2023-01-23 Thread Scott Cheloha
On Sat, Jan 21, 2023 at 05:02:49PM -0600, Scott Cheloha wrote:
> All the platforms have switched to clockintr.
> 
> Let's start by isolating statclock() from hardclock().  stathz is now
> always non-zero: statclock() must be called separately.  Update
> several of the stathz users to reflect that the value is always
> non-zero.
> 
> This is a first step toward making hardclock and statclock into
> schedulable entities.

Here are a few more.  The big comment in kern_clock.c needs to be
updated, as does the hardclock.9 manpage, but I think that those can
wait until the dust has settled.  More stuff is going to move around
in this area.

ok?
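
As a worked example of the calctsru() change below: with stathz = 100, a
process that accumulated tu_sticks = 250 system-time ticks gets

	st      = 250 * 1000000000 / 100  = 2500000000 ns
	tv_sec  = 2500000000 / 1000000000 = 2
	tv_nsec = 2500000000 % 1000000000 = 500000000

i.e. 2.5 seconds, exactly as before; only the stathz-versus-hz fallback
dance goes away.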

Index: kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.105
diff -u -p -r1.105 kern_clock.c
--- kern_clock.c14 Aug 2022 01:58:27 -  1.105
+++ kern_clock.c24 Jan 2023 00:03:06 -
@@ -98,8 +98,6 @@ volatile unsigned long jiffies;   /* XXX 
 void
 initclocks(void)
 {
-   int i;
-
ticks = INT_MAX - (15 * 60 * hz);
jiffies = ULONG_MAX - (10 * 60 * hz);
 
@@ -111,12 +109,9 @@ initclocks(void)
cpu_initclocks();
 
/*
-* Compute profhz/stathz, and fix profhz if needed.
+* Compute profhz/stathz.
 */
-   i = stathz ? stathz : hz;
-   if (profhz == 0)
-   profhz = i;
-   psratio = profhz / i;
+   psratio = profhz / stathz;
 
inittimecounter();
 }
@@ -158,12 +153,6 @@ hardclock(struct clockframe *frame)
}
}
 
-   /*
-* If no separate statistics clock is available, run it from here.
-*/
-   if (stathz == 0)
-   statclock(frame);
-
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
 
@@ -268,7 +257,7 @@ startprofclock(struct process *pr)
 
if ((pr->ps_flags & PS_PROFIL) == 0) {
atomic_setbits_int(>ps_flags, PS_PROFIL);
-   if (++profprocs == 1 && stathz != 0) {
+   if (++profprocs == 1) {
s = splstatclock();
psdiv = pscnt = psratio;
setstatclockrate(profhz);
@@ -287,7 +276,7 @@ stopprofclock(struct process *pr)
 
if (pr->ps_flags & PS_PROFIL) {
atomic_clearbits_int(>ps_flags, PS_PROFIL);
-   if (--profprocs == 0 && stathz != 0) {
+   if (--profprocs == 0) {
s = splstatclock();
psdiv = pscnt = 1;
setstatclockrate(stathz);
@@ -415,6 +404,6 @@ sysctl_clockrate(char *where, size_t *si
clkinfo.tick = tick;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
-   clkinfo.stathz = stathz ? stathz : hz;
+   clkinfo.stathz = stathz;
return (sysctl_rdstruct(where, sizep, newp, , sizeof(clkinfo)));
 }
Index: kern_resource.c
===
RCS file: /cvs/src/sys/kern/kern_resource.c,v
retrieving revision 1.76
diff -u -p -r1.76 kern_resource.c
--- kern_resource.c 17 Nov 2022 18:53:13 -  1.76
+++ kern_resource.c 24 Jan 2023 00:03:06 -
@@ -410,7 +410,6 @@ calctsru(struct tusage *tup, struct time
 struct timespec *ip)
 {
u_quad_t st, ut, it;
-   int freq;
 
st = tup->tu_sticks;
ut = tup->tu_uticks;
@@ -424,16 +423,14 @@ calctsru(struct tusage *tup, struct time
return;
}
 
-   freq = stathz ? stathz : hz;
-
-	st = st * 1000000000 / freq;
+	st = st * 1000000000 / stathz;
 	sp->tv_sec = st / 1000000000;
 	sp->tv_nsec = st % 1000000000;
-	ut = ut * 1000000000 / freq;
+	ut = ut * 1000000000 / stathz;
 	up->tv_sec = ut / 1000000000;
 	up->tv_nsec = ut % 1000000000;
 	if (ip != NULL) {
-		it = it * 1000000000 / freq;
+		it = it * 1000000000 / stathz;
 		ip->tv_sec = it / 1000000000;
 		ip->tv_nsec = it % 1000000000;
}
Index: sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.73
diff -u -p -r1.73 sched_bsd.c
--- sched_bsd.c 5 Dec 2022 23:18:37 -   1.73
+++ sched_bsd.c 24 Jan 2023 00:03:06 -
@@ -189,16 +189,6 @@ schedcpu(void *arg)
struct proc *p;
int s;
unsigned int newcpu;
-   int phz;
-
-   /*
-* If we have a statistics clock, use that to calculate CPU
-* time, otherwise revert to using the profiling clock (which,
-* in turn, defaults to hz if there is no separate profiling
-* clock available)
-*/
-   phz = stathz ? stathz : profhz;
-   KASSERT(phz);
 
LIST_FOREACH(p, , p_list) {
/*

gptimer(4): switch to clockintr

2023-01-23 Thread Scott Cheloha
Whoops, missed one.  This is the fifth and (I think) last armv7 clock
interrupt driver that needs to switch to clockintr.  gptimer(4) is
nearly identical to dmtimer(4).

Notable changes:

- Switch stathz from 128 to hz.
- Switch profhz from 1024 to (stathz * 10).

Everything else in the patch is just normal clockintr switching stuff
or duplicated from the dmtimer(4) patch.

jca@ has compile-tested this.

I would appreciate a test to confirm that the GENERIC boots.  I don't
think we need to do a full release build.

... if nobody knows where to find a board with gptimer(4), I am
looking for permission to just commit this as-is.  I cannot find any
entries in the dmesg mails of any machines with gptimer(4).  kettenis@
was uncertain whether we actually support any of the machines that have
this clock.

Preferences?  ok?
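
For anyone reading the conversion math around the gptimer_nsec_cycle_ratio
and gptimer_nsec_max variables introduced below, the pattern (shared with
dmtimer(4) and the other clockintr drivers) is a 32.32 fixed-point ratio:

	nsec_cycle_ratio = timer_freq * (1ULL << 32) / 1000000000;
	cycles = (nsecs * nsec_cycle_ratio) >> 32;

With the 32768 Hz timer here, a 10 ms request comes out to about 327
cycles; nsec_max = UINT64_MAX / nsec_cycle_ratio bounds the argument so
the multiplication cannot overflow.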

Index: gptimer.c
===
RCS file: /cvs/src/sys/arch/armv7/omap/gptimer.c,v
retrieving revision 1.17
diff -u -p -r1.17 gptimer.c
--- gptimer.c   22 Jan 2023 18:36:38 -  1.17
+++ gptimer.c   22 Jan 2023 23:52:32 -
@@ -23,9 +23,11 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -93,14 +95,12 @@
 
 #define TIMER_FREQUENCY			32768	/* 32kHz is used, selectable */
 
-static struct evcount clk_count;
-static struct evcount stat_count;
-
 void gptimer_attach(struct device *parent, struct device *self, void *args);
 int gptimer_intr(void *frame);
 void gptimer_wait(int reg);
 void gptimer_cpu_initclocks(void);
 void gptimer_delay(u_int);
+void gptimer_reset_tisr(void);
 void gptimer_setstatclockrate(int newhz);
 
 bus_space_tag_t gptimer_iot;
@@ -120,13 +120,16 @@ static struct timecounter gptimer_timeco
.tc_user = 0,
 };
 
-volatile u_int32_t nexttickevent;
-volatile u_int32_t nextstatevent;
-u_int32_t  ticks_per_second;
-u_int32_t  ticks_per_intr;
-u_int32_t  ticks_err_cnt;
-u_int32_t  ticks_err_sum;
-u_int32_t  statvar, statmin;
+uint64_t gptimer_nsec_cycle_ratio;
+uint64_t gptimer_nsec_max;
+
+void gptimer_rearm(void *, uint64_t);
+void gptimer_trigger(void *);
+
+const struct intrclock gptimer_intrclock = {
+   .ic_rearm = gptimer_rearm,
+   .ic_trigger = gptimer_trigger
+};
 
 const struct cfattach  gptimer_ca = {
sizeof (struct device), NULL, gptimer_attach
@@ -177,98 +180,10 @@ gptimer_attach(struct device *parent, st
gptimer_setstatclockrate, NULL);
 }
 
-/*
- * See comment in arm/xscale/i80321_clock.c
- *
- * counter is count up, but with autoreload timers it is not possible
- * to detect how many  interrupts passed while interrupts were blocked.
- * also it is not possible to atomically add to the register
- * get get it to precisely fire at a non-fixed interval.
- *
- * To work around this two timers are used, GPT1 is used as a reference
- * clock without reload , however we just ignore the interrupt it
- * would (may?) generate.
- *
- * Internally this keeps track of when the next timer should fire
- * and based on that time and the current value of the reference
- * clock a number is written into the timer count register to schedule
- * the next event.
- */
-
 int
 gptimer_intr(void *frame)
 {
-   u_int32_t now, r;
-   u_int32_t nextevent, duration;
-
-   /* clear interrupt */
-   now = bus_space_read_4(gptimer_iot, gptimer_ioh1, GP_TCRR);
-
-   while ((int32_t) (nexttickevent - now) < 0) {
-   nexttickevent += ticks_per_intr;
-   ticks_err_sum += ticks_err_cnt;
-#if 0
-   if (ticks_err_sum  > hz) {
-   u_int32_t match_error;
-   match_error = ticks_err_sum / hz
-   ticks_err_sum -= (match_error * hz);
-   }
-#else
-   /* looping a few times is faster than divide */
-   while (ticks_err_sum  > hz) {
-   nexttickevent += 1;
-   ticks_err_sum -= hz;
-   }
-#endif
-   clk_count.ec_count++;
-   hardclock(frame);
-   }
-   while ((int32_t) (nextstatevent - now) < 0) {
-   do {
-   r = random() & (statvar -1);
-   } while (r == 0); /* random == 0 not allowed */
-   nextstatevent += statmin + r;
-   /* XXX - correct nextstatevent? */
-   stat_count.ec_count++;
-   statclock(frame);
-   }
-   if ((nexttickevent - now) < (nextstatevent - now))
-nextevent = nexttickevent;
-else
-nextevent = nextstatevent;
-
-/* XXX */
-   duration = nextevent -
-   bus_space_read_4(gptimer_iot, gptimer_ioh1, GP_TCRR);
-#if 0
-   printf("duration 0x%x %x %x\n", nextevent -
-   bus_space_read_4(gptimer_iot, gptimer_ioh1, GP_TCRR),
-   bus_space_read_4(gptimer_iot, gptimer_ioh0, GP_TCRR),
-  

hardclock: don't call statclock(), stathz is always non-zero

2023-01-21 Thread Scott Cheloha
All the platforms have switched to clockintr.

Let's start by isolating statclock() from hardclock().  stathz is now
always non-zero: statclock() must be called separately.  Update
several of the stathz users to reflect that the value is always
non-zero.

This is a first step toward making hardclock and statclock into
schedulable entities.

ok?
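
One user-visible consequence worth spelling out: the kern_time.c hunk
below means clock_getres(2) now always reports the statistics clock
period for the CPU-time clocks.  With stathz = 100 that is

	ts.tv_nsec = 1000000000 / stathz = 10000000

i.e. a resolution of 10 ms, the same value the old stathz-or-hz fallback
produced on every platform that already ran at hz = 100.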

Index: kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.105
diff -u -p -r1.105 kern_clock.c
--- kern_clock.c14 Aug 2022 01:58:27 -  1.105
+++ kern_clock.c21 Jan 2023 22:59:34 -
@@ -98,8 +98,6 @@ volatile unsigned long jiffies;   /* XXX 
 void
 initclocks(void)
 {
-   int i;
-
ticks = INT_MAX - (15 * 60 * hz);
jiffies = ULONG_MAX - (10 * 60 * hz);
 
@@ -111,12 +109,9 @@ initclocks(void)
cpu_initclocks();
 
/*
-* Compute profhz/stathz, and fix profhz if needed.
+* Compute profhz/stathz.
 */
-   i = stathz ? stathz : hz;
-   if (profhz == 0)
-   profhz = i;
-   psratio = profhz / i;
+   psratio = profhz / stathz;
 
inittimecounter();
 }
@@ -158,12 +153,6 @@ hardclock(struct clockframe *frame)
}
}
 
-   /*
-* If no separate statistics clock is available, run it from here.
-*/
-   if (stathz == 0)
-   statclock(frame);
-
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
 
@@ -268,7 +257,7 @@ startprofclock(struct process *pr)
 
if ((pr->ps_flags & PS_PROFIL) == 0) {
atomic_setbits_int(>ps_flags, PS_PROFIL);
-   if (++profprocs == 1 && stathz != 0) {
+   if (++profprocs == 1) {
s = splstatclock();
psdiv = pscnt = psratio;
setstatclockrate(profhz);
@@ -287,7 +276,7 @@ stopprofclock(struct process *pr)
 
if (pr->ps_flags & PS_PROFIL) {
atomic_clearbits_int(>ps_flags, PS_PROFIL);
-   if (--profprocs == 0 && stathz != 0) {
+   if (--profprocs == 0) {
s = splstatclock();
psdiv = pscnt = 1;
setstatclockrate(stathz);
@@ -415,6 +404,6 @@ sysctl_clockrate(char *where, size_t *si
clkinfo.tick = tick;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
-   clkinfo.stathz = stathz ? stathz : hz;
+   clkinfo.stathz = stathz;
return (sysctl_rdstruct(where, sizep, newp, , sizeof(clkinfo)));
 }
Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.161
diff -u -p -r1.161 kern_time.c
--- kern_time.c 2 Jan 2023 23:09:48 -   1.161
+++ kern_time.c 21 Jan 2023 22:59:34 -
@@ -218,10 +218,9 @@ sys_clock_getres(struct proc *p, void *v
struct timespec ts;
struct proc *q;
u_int64_t scale;
-   int error = 0, realstathz;
+   int error = 0;
 
memset(, 0, sizeof(ts));
-   realstathz = (stathz == 0) ? hz : stathz;
clock_id = SCARG(uap, clock_id);
 
switch (clock_id) {
@@ -238,7 +237,7 @@ sys_clock_getres(struct proc *p, void *v
break;
case CLOCK_PROCESS_CPUTIME_ID:
case CLOCK_THREAD_CPUTIME_ID:
-	ts.tv_nsec = 1000000000 / realstathz;
+	ts.tv_nsec = 1000000000 / stathz;
break;
default:
/* check for clock from pthread_getcpuclockid() */
@@ -248,7 +247,7 @@ sys_clock_getres(struct proc *p, void *v
if (q == NULL)
error = ESRCH;
else
-	ts.tv_nsec = 1000000000 / realstathz;
+	ts.tv_nsec = 1000000000 / stathz;
KERNEL_UNLOCK();
} else
error = EINVAL;



[v2] dmtimer(4): switch to clockintr

2023-01-10 Thread Scott Cheloha
This patch switches dmtimer(4) over to use the clockintr subsystem.
This is v2.  v1 is here:

https://marc.info/?l=openbsd-tech&m=167060320326851&w=2

This revision fixes a bug in dmtimer_rearm() that kept the kernel from
booting with the v1 patch.  This revision also switches stathz from 128
to 100 and profhz from 1024 to 1000 to align dmtimer(4) with other
drivers and platforms.

I am looking for a tester to build a release atop kernel with this
patch and then upgrade from the resulting bsd.rd.  This will
demonstrate that the patched system is stable enough to self-host.

This is the last armv7 driver that needs testing before we can switch
armv7 over to clockintr.
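
For reviewers new to the clockintr conversions, the shape every one of
these drivers ends up with is the same (a sketch, not the dmtimer code
itself; the "example_" names are placeholders):

	void
	example_rearm(void *cookie, uint64_t nsecs)
	{
		uint32_t cycles;

		if (nsecs > example_nsec_max)		/* overflow guard */
			nsecs = example_nsec_max;
		cycles = (nsecs * example_nsec_cycle_ratio) >> 32;
		if (cycles == 0)
			cycles = 1;
		/* program the hardware to interrupt "cycles" ticks from now */
	}

	void
	example_trigger(void *cookie)
	{
		/* make the clock interrupt fire as soon as possible */
	}

	struct intrclock example_intrclock = {
		.ic_rearm = example_rearm,
		.ic_trigger = example_trigger
	};

The dmtimer versions in the diff below fill in the hardware-specific
parts.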

Index: arm/include/_types.h
===
RCS file: /cvs/src/sys/arch/arm/include/_types.h,v
retrieving revision 1.19
diff -u -p -r1.19 _types.h
--- arm/include/_types.h5 Mar 2018 01:15:25 -   1.19
+++ arm/include/_types.h11 Jan 2023 01:05:39 -
@@ -35,6 +35,8 @@
 #ifndef _ARM__TYPES_H_
 #define _ARM__TYPES_H_
 
+#define__HAVE_CLOCKINTR
+
 #if defined(_KERNEL)
 typedef struct label_t {
long val[11];
Index: arm/include/cpu.h
===
RCS file: /cvs/src/sys/arch/arm/include/cpu.h,v
retrieving revision 1.61
diff -u -p -r1.61 cpu.h
--- arm/include/cpu.h   6 Jul 2021 09:34:06 -   1.61
+++ arm/include/cpu.h   11 Jan 2023 01:05:39 -
@@ -149,6 +149,7 @@ voidarm32_vector_init(vaddr_t, int);
  * Per-CPU information.  For now we assume one CPU.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -198,7 +199,7 @@ struct cpu_info {
 #ifdef GPROF
struct gmonparam *ci_gmon;
 #endif
-
+   struct clockintr_queue  ci_queue;
charci_panicbuf[512];
 };
 
Index: armv7/omap/dmtimer.c
===
RCS file: /cvs/src/sys/arch/armv7/omap/dmtimer.c,v
retrieving revision 1.15
diff -u -p -r1.15 dmtimer.c
--- armv7/omap/dmtimer.c21 Feb 2022 10:57:58 -  1.15
+++ armv7/omap/dmtimer.c11 Jan 2023 01:05:39 -
@@ -25,8 +25,10 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -95,11 +97,9 @@
 #define TIMER_FREQUENCY			32768	/* 32kHz is used, selectable */
 #define MAX_TIMERS 2
 
-static struct evcount clk_count;
-static struct evcount stat_count;
-
 void dmtimer_attach(struct device *parent, struct device *self, void *args);
 int dmtimer_intr(void *frame);
+void dmtimer_reset_tisr(void);
 void dmtimer_wait(int reg);
 void dmtimer_cpu_initclocks(void);
 void dmtimer_delay(u_int);
@@ -117,6 +117,14 @@ static struct timecounter dmtimer_timeco
.tc_priv = NULL,
 };
 
+void dmtimer_rearm(void *, uint64_t);
+void dmtimer_trigger(void *);
+
+struct intrclock dmtimer_intrclock = {
+   .ic_rearm = dmtimer_rearm,
+   .ic_trigger = dmtimer_trigger
+};
+
 bus_space_handle_t dmtimer_ioh0;
 int dmtimer_irq = 0;
 
@@ -126,13 +134,8 @@ struct dmtimer_softc {
bus_space_handle_t  sc_ioh[MAX_TIMERS];
u_int32_t   sc_irq;
u_int32_t   sc_ticks_per_second;
-   u_int32_t   sc_ticks_per_intr;
-   u_int32_t   sc_ticks_err_cnt;
-   u_int32_t   sc_ticks_err_sum;
-   u_int32_t   sc_statvar;
-   u_int32_t   sc_statmin;
-   u_int32_t   sc_nexttickevent;
-   u_int32_t   sc_nextstatevent;
+   u_int64_t   sc_nsec_cycle_ratio;
+   u_int64_t   sc_nsec_max;
 };
 
 const struct cfattach  dmtimer_ca = {
@@ -208,82 +211,11 @@ dmtimer_attach(struct device *parent, st
printf(" rev %d.%d\n", (rev & DM_TIDR_MAJOR) >> 8, rev & DM_TIDR_MINOR);
 }
 
-/*
- * See comment in arm/xscale/i80321_clock.c
- *
- * Counter is count up, but with autoreload timers it is not possible
- * to detect how many interrupts passed while interrupts were blocked.
- * Also it is not possible to atomically add to the register.
- *
- * To work around this two timers are used, one is used as a reference
- * clock without reload, however we just disable the interrupt it
- * could generate.
- *
- * Internally this keeps track of when the next timer should fire
- * and based on that time and the current value of the reference
- * clock a number is written into the timer count register to schedule
- * the next event.
- */
-
 int
 dmtimer_intr(void *frame)
 {
-   struct dmtimer_softc*sc = dmtimer_cd.cd_devs[1];
-   u_int32_t   now, r, nextevent;
-   int32_t duration;
-
-   now = bus_space_read_4(sc->sc_iot, sc->sc_ioh[1], DM_TCRR);
-
-   while ((int32_t) (sc->sc_nexttickevent - now) <= 0) {
-   sc->sc_nexttickevent += sc->sc_ticks_per_intr;
-   sc->sc_ticks_err_sum += 

Re: vmm: mask WAITPKG cpuid feature to hide TPAUSE

2023-01-08 Thread Scott Cheloha
On Sun, Jan 08, 2023 at 07:20:46PM -0500, Scott Cheloha wrote:
> On Sun, Jan 08, 2023 at 03:09:44PM -0500, Dave Voutila wrote:
> > 
> > Philip Guenther  writes:
> > 
> > > On Sat, Jan 7, 2023 at 11:04 AM Dave Voutila  wrote:
> > >
> > >  Bringing this to tech@ to increase my chance of someone testing my
> > >  diff.
> > >
> > >  As reported in this thread on misc@ [1], I believe newer Intel hardware
> > >  may be experiencing issues hosting Linux guests under vmm/vmd. It looks
> > >  like there are some newer instructions Intel added (TPAUSE specifically)
> > >  that also involve some new MSR(s).
> > >
> > >  I don't have 12th gen Intel hardware to test this on (I think that's
> > >  Alder Lake). I'd like to mask this feature from vmm guests since it's
> > >  related to an MSR we don't yet pass through or emulate and has to do
> > >  with the TSC (which has it's own challenges in vmm).
> > >
> > >  For someone testing, you should be able to grab an Alpine Linux iso
> > >  (-virt flavor) and boot it with vmd with the diff. (Without it should
> > >  "hang" and spike CPU or just die.) Also check that WAITPKG shows up in
> > >  your dmesg on the cpu feature output.
> > >
> > > This seem like it'll obviously work, but I guess it seems to me that this 
> > > "opt-out" approach is generally
> > > unsafe/unstable and vmd should consider actively switching to "opt-in" on 
> > > all these CPUID feature bits.  I mean,
> > > what bits are defined in the SEFF first-leaf EDX that _do_ work with vmd?
> > >
> > 
> > Great point (I think you mean ECX). Here's an updated diff that flips it
> > to a whitelist so Intel/AMD don't burn me with these new bits in the
> > future. This better?
> 
> I don't mean to fearmonger, but doesn't this open vmm(4) and the wider
> kernel up to risk from all future "unknown unknowns"?  We're basically
> saying the guest can use any feature passed through from the host
> before any developer has even glanced at the documentation for the new
> bit, right?
> 
> I'm not saying the "opt-out" approach shown in your patch is even
> *likely* to be a problem.  But the "opt-in" approach doesn't seem
> unreasonably conservative to me.  When something is widely used enough
> to cause a problem, we add the bit.
> 
> Or maybe every few months we ought to check for a new x86-64 ISA pdf
> from Intel, since they seem to be the ones expanding the feature set.

Whoops, ignore this, I misunderstood guenther@'s idea and dv@'s patch.



Re: vmm: mask WAITPKG cpuid feature to hide TPAUSE

2023-01-08 Thread Scott Cheloha
On Sun, Jan 08, 2023 at 03:09:44PM -0500, Dave Voutila wrote:
> 
> Philip Guenther  writes:
> 
> > On Sat, Jan 7, 2023 at 11:04 AM Dave Voutila  wrote:
> >
> >  Bringing this to tech@ to increase my chance of someone testing my
> >  diff.
> >
> >  As reported in this thread on misc@ [1], I believe newer Intel hardware
> >  may be experiencing issues hosting Linux guests under vmm/vmd. It looks
> >  like there are some newer instructions Intel added (TPAUSE specifically)
> >  that also involve some new MSR(s).
> >
> >  I don't have 12th gen Intel hardware to test this on (I think that's
> >  Alder Lake). I'd like to mask this feature from vmm guests since it's
> >  related to an MSR we don't yet pass through or emulate and has to do
> >  with the TSC (which has it's own challenges in vmm).
> >
> >  For someone testing, you should be able to grab an Alpine Linux iso
> >  (-virt flavor) and boot it with vmd with the diff. (Without it should
> >  "hang" and spike CPU or just die.) Also check that WAITPKG shows up in
> >  your dmesg on the cpu feature output.
> >
> > This seem like it'll obviously work, but I guess it seems to me that this 
> > "opt-out" approach is generally
> > unsafe/unstable and vmd should consider actively switching to "opt-in" on 
> > all these CPUID feature bits.  I mean,
> > what bits are defined in the SEFF first-leaf EDX that _do_ work with vmd?
> >
> 
> Great point (I think you mean ECX). Here's an updated diff that flips it
> to a whitelist so Intel/AMD don't burn me with these new bits in the
> future. This better?

I don't mean to fearmonger, but doesn't this open vmm(4) and the wider
kernel up to risk from all future "unknown unknowns"?  We're basically
saying the guest can use any feature passed through from the host
before any developer has even glanced at the documentation for the new
bit, right?

I'm not saying the "opt-out" approach shown in your patch is even
*likely* to be a problem.  But the "opt-in" approach doesn't seem
unreasonably conservative to me.  When something is widely used enough
to cause a problem, we add the bit.

Or maybe every few months we ought to check for a new x86-64 ISA pdf
from Intel, since they seem to be the ones expanding the feature set.



[v3] sparc64: switch to clockintr

2023-01-08 Thread Scott Cheloha
Good news!

I have tested this revision in a T4-2 LDOM with help from kn@.  Booted
the patched kernel, built a release, booted the resulting bsd.rd,
upgraded, booted into the upgrade kernel.  The whole cycle.  It all
worked.

In this revision I believe I am *now* using send_softint() correctly,
but need confirmation.  We are also now correctly masking the
comparison register.  Otherwise, this patch is equivalent to v2:

https://marc.info/?l=openbsd-tech&m=167287772220176&w=2

miod:   Could you confirm that this works on USI (%TICK) and USIIe
(PCI-bridge %STICK)?  I have tested it, so I am pretty
confident this one isn't a waste of your time.

Assuming it works on USI and USIIe, are we all ok?
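
On the comparison-register masking: the assumption behind the
TICK_COUNT_MASK/STICK_COUNT_MASK definitions below is that the counters
are 63 bits wide, with bit 63 reserved for control state rather than
count, so both the value read back and the value written to the compare
register have to be reduced modulo 2^63, e.g.

	cmpr = (tick + cycles) & TICK_COUNT_MASK;

Otherwise an addition that carries into bit 63 would either flip the
control bit or program a value the 63-bit counter never reaches.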

Index: sys/arch/sparc64/include/cpu.h
===
RCS file: /cvs/src/sys/arch/sparc64/include/cpu.h,v
retrieving revision 1.100
diff -u -p -r1.100 cpu.h
--- sys/arch/sparc64/include/cpu.h  22 Oct 2022 20:09:41 -  1.100
+++ sys/arch/sparc64/include/cpu.h  8 Jan 2023 23:16:30 -
@@ -78,6 +78,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 
@@ -129,7 +130,7 @@ struct cpu_info {
int ci_want_resched;
int ci_handled_intr_level;
void*ci_intrpending[16][8];
-   u_int64_t   ci_tick;
+   struct clockintr_queue  ci_queue;
struct intrhand ci_tickintr;
 
volatile intci_ddb_paused;
Index: sys/arch/sparc64/include/_types.h
===
RCS file: /cvs/src/sys/arch/sparc64/include/_types.h,v
retrieving revision 1.23
diff -u -p -r1.23 _types.h
--- sys/arch/sparc64/include/_types.h   5 Mar 2018 01:15:25 -   1.23
+++ sys/arch/sparc64/include/_types.h   8 Jan 2023 23:16:31 -
@@ -35,6 +35,8 @@
 #ifndef _MACHINE__TYPES_H_
 #define _MACHINE__TYPES_H_
 
+#define __HAVE_CLOCKINTR
+
 #if defined(_KERNEL)
 typedef struct label_t {
long val[2];
Index: sys/arch/sparc64/sparc64/clock.c
===
RCS file: /cvs/src/sys/arch/sparc64/sparc64/clock.c,v
retrieving revision 1.74
diff -u -p -r1.74 clock.c
--- sys/arch/sparc64/sparc64/clock.c29 Dec 2022 22:44:23 -  1.74
+++ sys/arch/sparc64/sparc64/clock.c8 Jan 2023 23:16:31 -
@@ -70,10 +70,12 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef GPROF
 #include 
 #endif
 #include 
+#include 
 #include 
 #include 
 
@@ -132,37 +134,53 @@ struct timecounter sys_tick_timecounter 
.tc_user = TC_SYS_TICK,
 };
 
-/*
- * Statistics clock interval and variance, in usec.  Variance must be a
- * power of two.  Since this gives us an even number, not an odd number,
- * we discard one case and compensate.  That is, a variance of 1024 would
- * give us offsets in [0..1023].  Instead, we take offsets in [1..1023].
- * This is symmetric about the point 512, or statvar/2, and thus averages
- * to that value (assuming uniform random numbers).
- */
-/* XXX fix comment to match value */
-int statvar = 8192;
-int statmin;   /* statclock interval - 1/2*variance */
-
-static long tick_increment;
-
 void   tick_start(void);
 void   sys_tick_start(void);
 void   stick_start(void);
 
-void   tick_rearm(uint64_t);
-void   sys_tick_rearm(uint64_t);
-void   stick_rearm(uint64_t);
-
 inttickintr(void *);
 intsys_tickintr(void *);
 intstickintr(void *);
-intschedintr(void *);
 
-static struct intrhand level10 = { clockintr };
+/* %TICK is at most a 63-bit counter. */
+#define TICK_COUNT_MASK 0x7fffffffffffffff
+
+uint64_t tick_nsec_cycle_ratio;
+uint64_t tick_nsec_max;
+
+void tick_rearm(void *, uint64_t);
+void tick_trigger(void *);
+
+const struct intrclock tick_intrclock = {
+   .ic_rearm = tick_rearm,
+   .ic_trigger = tick_trigger
+};
+
+/* %STICK is at most a 63-bit counter. */
+#define STICK_COUNT_MASK 0x7fffffffffffffff
+
+uint64_t sys_tick_nsec_cycle_ratio;
+uint64_t sys_tick_nsec_max;
+
+void sys_tick_rearm(void *, uint64_t);
+void sys_tick_trigger(void *);
+
+const struct intrclock sys_tick_intrclock = {
+   .ic_rearm = sys_tick_rearm,
+   .ic_trigger = sys_tick_trigger
+};
+
+void stick_rearm(void *, uint64_t);
+void stick_trigger(void *);
+
+const struct intrclock stick_intrclock = {
+   .ic_rearm = stick_rearm,
+   .ic_trigger = stick_trigger
+};
+
+void sparc64_raise_clockintr(void);
+
 static struct intrhand level0 = { tickintr };
-static struct intrhand level14 = { statintr };
-static struct intrhand schedint = { schedintr };
 
 /*
  * clock (eeprom) attaches at the sbus or the ebus (PCI)
@@ -464,6 +482,7 @@ timermatch(struct device *parent, void *
 static void
 timerattach(struct device *parent, struct device *self, void *aux)
 {
+#if 0
struct mainbus_attach_args *ma = aux;
u_int *va = ma->ma_address;

@@ -492,6 +511,8 @@ timerattach(struct 

[v2] sparc64: switch to clockintr

2023-01-04 Thread Scott Cheloha
Now that we have resolved the problems with the UltraSPARC IIe, let's
try the clockintr switch again.  For reference, here is v1:

https://marc.info/?l=openbsd-tech&m=166776418803680&w=2

The major difference in v2 is that we send_softint() if we miss when
setting the comparison register instead of spinning and retrying.  I
didn't do this in v1 because I didn't know it was an option.

This is untested.  I'm looking for tests on UltraSPARC I or UltraSPARC
II (%TICK), UltraSPARC IIe (Hummingbird PCI bridge %STICK), and
something newer (%STICK).

kettenis, kn: could one of you try this on a newer system?  miod@ only
has the older stuff.  mlarkin@ said he'd try it on the T4 but now he's
out of office.
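
The send_softint() change mentioned above amounts to this pattern in the
rearm hooks (a sketch with illustrative helper names):

	t0 = rd_tick();
	write_compare(t0 + cycles);
	if (rd_tick() - t0 >= cycles) {
		/*
		 * The counter already passed the value we just wrote.
		 * Rather than spin and retry, hand ourselves the clock
		 * interrupt directly.
		 */
		sparc64_raise_clockintr();
	}

where sparc64_raise_clockintr() uses send_softint() to post the clock
interrupt to the local CPU.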

Index: sys/arch/sparc64/include/cpu.h
===
RCS file: /cvs/src/sys/arch/sparc64/include/cpu.h,v
retrieving revision 1.100
diff -u -p -r1.100 cpu.h
--- sys/arch/sparc64/include/cpu.h  22 Oct 2022 20:09:41 -  1.100
+++ sys/arch/sparc64/include/cpu.h  5 Jan 2023 00:10:15 -
@@ -78,6 +78,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 
@@ -129,7 +130,7 @@ struct cpu_info {
int ci_want_resched;
int ci_handled_intr_level;
void*ci_intrpending[16][8];
-   u_int64_t   ci_tick;
+   struct clockintr_queue  ci_queue;
struct intrhand ci_tickintr;
 
volatile intci_ddb_paused;
Index: sys/arch/sparc64/include/_types.h
===
RCS file: /cvs/src/sys/arch/sparc64/include/_types.h,v
retrieving revision 1.23
diff -u -p -r1.23 _types.h
--- sys/arch/sparc64/include/_types.h   5 Mar 2018 01:15:25 -   1.23
+++ sys/arch/sparc64/include/_types.h   5 Jan 2023 00:10:15 -
@@ -35,6 +35,8 @@
 #ifndef _MACHINE__TYPES_H_
 #define _MACHINE__TYPES_H_
 
+#define __HAVE_CLOCKINTR
+
 #if defined(_KERNEL)
 typedef struct label_t {
long val[2];
Index: sys/arch/sparc64/sparc64/clock.c
===
RCS file: /cvs/src/sys/arch/sparc64/sparc64/clock.c,v
retrieving revision 1.74
diff -u -p -r1.74 clock.c
--- sys/arch/sparc64/sparc64/clock.c29 Dec 2022 22:44:23 -  1.74
+++ sys/arch/sparc64/sparc64/clock.c5 Jan 2023 00:10:15 -
@@ -70,10 +70,12 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef GPROF
 #include 
 #endif
 #include 
+#include 
 #include 
 #include 
 
@@ -132,37 +134,47 @@ struct timecounter sys_tick_timecounter 
.tc_user = TC_SYS_TICK,
 };
 
-/*
- * Statistics clock interval and variance, in usec.  Variance must be a
- * power of two.  Since this gives us an even number, not an odd number,
- * we discard one case and compensate.  That is, a variance of 1024 would
- * give us offsets in [0..1023].  Instead, we take offsets in [1..1023].
- * This is symmetric about the point 512, or statvar/2, and thus averages
- * to that value (assuming uniform random numbers).
- */
-/* XXX fix comment to match value */
-int statvar = 8192;
-int statmin;   /* statclock interval - 1/2*variance */
-
-static long tick_increment;
-
 void   tick_start(void);
 void   sys_tick_start(void);
 void   stick_start(void);
 
-void   tick_rearm(uint64_t);
-void   sys_tick_rearm(uint64_t);
-void   stick_rearm(uint64_t);
-
 inttickintr(void *);
 intsys_tickintr(void *);
 intstickintr(void *);
-intschedintr(void *);
 
-static struct intrhand level10 = { clockintr };
+uint64_t tick_nsec_cycle_ratio;
+uint64_t tick_nsec_max;
+
+void tick_rearm(void *, uint64_t);
+void tick_trigger(void *);
+
+const struct intrclock tick_intrclock = {
+   .ic_rearm = tick_rearm,
+   .ic_trigger = tick_trigger
+};
+
+uint64_t sys_tick_nsec_cycle_ratio;
+uint64_t sys_tick_nsec_max;
+
+void sys_tick_rearm(void *, uint64_t);
+void sys_tick_trigger(void *);
+
+const struct intrclock sys_tick_intrclock = {
+   .ic_rearm = sys_tick_rearm,
+   .ic_trigger = sys_tick_trigger
+};
+
+void stick_rearm(void *, uint64_t);
+void stick_trigger(void *);
+
+const struct intrclock stick_intrclock = {
+   .ic_rearm = stick_rearm,
+   .ic_trigger = stick_trigger
+};
+
+void sparc64_raise_clockintr(void);
+
 static struct intrhand level0 = { tickintr };
-static struct intrhand level14 = { statintr };
-static struct intrhand schedint = { schedintr };
 
 /*
  * clock (eeprom) attaches at the sbus or the ebus (PCI)
@@ -466,7 +478,7 @@ timerattach(struct device *parent, struc
 {
struct mainbus_attach_args *ma = aux;
u_int *va = ma->ma_address;
-   
+#if 0
/*
 * What we should have are 3 sets of registers that reside on
 * different parts of SYSIO or PSYCHO.  We'll use the prom
@@ -492,6 +504,8 @@ timerattach(struct device *parent, struc
 
printf(" ivec 0x%llx, 0x%llx\n", INTVEC(level10.ih_number),

Re: [v3] timeout.9: document missing interfaces, miscellaneous rewrites

2022-12-31 Thread Scott Cheloha
On Sat, Dec 31, 2022 at 11:31:21PM +0100, Ingo Schwarze wrote:
> Hi Scott,
> 
> in the NAME section, please put timeout_add_nsec after timeout_add_usec
> to agree with the order in the SYNOPSIS.

Whoops, done.

> In any case, please go ahead.  It appears jmc@ is developing a sore
> elbow from more than a year of medicine ball ping pong.  ;-)

These rewrites tend to gather a lot of moss, yeah :)

> The following are merely suggestions / observations / questions,
> not conditions.
> 
> Scott Cheloha wrote on Sat, Dec 31, 2022 at 11:22:22AM -0500:
> 
> > mvs@ is nudging me to realign the timeout.9 page with the state of the
> > kernel.
> > 
> > Here is my rewrite (again).
> > 
> > There are some bits that I want to rework.  The opening paragraph is
> > especially clickety-clackety.
> 
> The opening paragraph seems fine to me.  The second paragraph might
> be considered a bit awkward.  If you rename the struct timeout
> argument from *to to *timeout in all prototypes in the SYNOPSIS
> and everywhere in the text, you might be able to join the second and
> third paragraphs, for example as follows:
> 
> All state is encapsulated in a
> .Vt struct timeout
> allocated by the caller.
> The
> .Fn timeout_set
> function initializes the
> .Fa timeout
> before it can be passed to any of the other functions.
> When the timeout ...
> 
> That way, you get rid of the word "caller-allocated" and the
> parenthetic remark, and some of the later text may also simplify
> all by itself in a natural way.

I need to rethink this.  I'm going to go ahead as-is.  Maybe we can
improve it later.

> > Still, I think this is an improvement over what's in-tree.  And the
> > technical content should be complete and accurate, which is the most
> > important thing.
> > 
> > ok?
> > 
> > Index: timeout.9
> > ===
> > RCS file: /cvs/src/share/man/man9/timeout.9,v
> > retrieving revision 1.55
> > diff -u -p -r1.55 timeout.9
> > --- timeout.9   22 Jun 2022 14:10:49 -  1.55
> > +++ timeout.9   31 Dec 2022 16:19:27 -
> [...]
> > @@ -44,281 +46,370 @@
> [...]
> > -Once initialized, the
> > -.Fa to
> > -structure can be used repeatedly in
> > -.Fn timeout_add
> > -and
> > -.Fn timeout_del
> > -and does not need to be reinitialized unless
> > -the function called and/or its argument must change.
> 
> Are you deleting this information on purpose?

I think timeout reinitialization is a terrible idea.  A bug magnet.
An accident waiting to happen.

... I suppose we ought to include the information, though.  I have put
it back.

> [...]
> > -timeout in at least
> 
> Are you deleting the words "at least" on purpose, or should they be
> reinserted into this sentence:
> 
>   KCLOCK_NONE timeouts may be scheduled with the function timeout_add(),
>   which schedules the given timeout to execute after nticks hardclock(9)
>   ticks have elapsed.

Whoops, I have put it back, thanks.

> [...]
> > +The
> >  .Fn timeout_del_barrier
> > -is like
> > -.Fn timeout_del
> > -but it will wait until any current execution of the timeout has completed.
> > +function is similar to
> > +.Fn timeout_del ,
> > +except that it may block until any current execution of the given timeout
> > +has completed.
> 
> This appears to change the meaning.
> 
> The old text provides a guarantee that timeout_del_barrier(9)
> does not return before the timeout completes, if it is currently
> running.  The new wording no longer provides that guarantee.
> It that intentional?
> 
> Otherwise, s/may block/blocks/ ?  Or, if you think it should be
> more explicit, maybe something like:
> 
>   except that, if the timeout is currently executing,
>   .Fn timeout_del
>   blocks until execution of the timeout has completed.

I have tweaked it to indicate that it does not return until any
ongoing execution is completed.
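
To make the guarantee concrete, the pattern the wording is meant to
support is the usual teardown sequence (illustrative):

	timeout_del_barrier(&sc->sc_tick_tmo);
	/* by here the handler is neither pending nor running... */
	free(sc, M_DEVBUF, sizeof(*sc));
	/* ...so freeing its argument is safe */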

> [...]
> > -Otherwise, the system will deadlock.
> > +Otherwise,
> > +the system will deadlock.
> 
> No need to change that.  The rule "break the line after a comma"
> is specific to the Linux man pages project, and i consider it excessive.
> We only follow "new sentence, new line".

Oh, sure.  That's useful.  I will keep that in mind in the future.
I assumed it was a rule, but I can't remember where I got it from.

--

Gonna go with the attached.

Index: timeout.9
===
RCS file: /cvs/src/share/man/man9/timeout.9,v
retrieving revision 1.55
diff -u -p -r1.55 timeout.9
---

Re: ssh: progress meter: prefer setitimer(2) to alarm(3)

2022-12-31 Thread Scott Cheloha
On Sat, Dec 31, 2022 at 07:05:20PM +0100, Mark Kettenis wrote:
> > Date: Sat, 31 Dec 2022 10:33:26 -0500
> > From: Scott Cheloha 
> > 
> > Here's another one.
> > 
> > The progress meter in scp(1) and sftp(1) updates periodically, once
> > per second.  But using alarm(3) to repeatedly rearm the signal causes
> > that update period to drift forward:
> > 
> > [...]
> > 
> > If we use setitimer(2), the update period does not drift:
> > 
> > [...]
> 
> Bad idea.  The setitimer(2) interface is marked as "OB XSI" in POSIX,
> which means that it is considered "Obsolescent" and may be removed in
> a future version of POSIX.  Since we want ssh to be as portable as
> possible we shouldn't use it there.

Strict POSIX conformance is pretty portable.  But in practice, I think
of portability as "using stuff everyone has".

POSIX has been hunting the 4.1cBSD time interfaces for over two
decades, but no practical operating system is going to remove
gettimeofday() or setitimer() just because POSIX prefers
clock_gettime() and timer_settime() (which we don't have).

Even Windows went with Linux: WSL2 has Linux syscall compatibility,
and they have setitimer().  Defined here:

https://github.com/microsoft/WSL2-Linux-Kernel/blob/linux-msft-wsl-5.15.y/kernel/time/itimer.c#L332

> Especially for something that really is just a cosmetic "fix".

I admit that this is a minor gripe.  I can live without it.

I hope that setitimer's looming POSIX obsolescence is not a barrier to
my using it in other programs in the tree, though.
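
For reference, the shape of the call I mean (a sketch, not the exact
progressmeter change):

	struct itimerval it;

	memset(&it, 0, sizeof(it));
	it.it_value.tv_sec = 1;		/* first update one second out */
	it.it_interval.tv_sec = 1;	/* then rearmed by the kernel */
	(void)setitimer(ITIMER_REAL, &it, NULL);

Because the kernel reloads it_value from it_interval itself, the period
does not accumulate the handler/rearm latency the way a repeated alarm()
call does.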



[v3] timeout.9: document missing interfaces, miscellaneous rewrites

2022-12-31 Thread Scott Cheloha
mvs@ is nudging me to realign the timeout.9 page with the state of the
kernel.

Here is my rewrite (again).

There are some bits that I want to rework.  The opening paragraph is
especially clickety-clackety.

Still, I think this is an improvement over what's in-tree.  And the
technical content should be complete and accurate, which is the most
important thing.

ok?

Index: timeout.9
===
RCS file: /cvs/src/share/man/man9/timeout.9,v
retrieving revision 1.55
diff -u -p -r1.55 timeout.9
--- timeout.9   22 Jun 2022 14:10:49 -  1.55
+++ timeout.9   31 Dec 2022 16:19:27 -
@@ -1,6 +1,7 @@
 .\"$OpenBSD: timeout.9,v 1.55 2022/06/22 14:10:49 visa Exp $
 .\"
 .\" Copyright (c) 2000 Artur Grabowski 
+.\" Copyright (c) 2021, 2022 Scott Cheloha 
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
@@ -36,6 +37,7 @@
 .Nm timeout_add_nsec ,
 .Nm timeout_add_usec ,
 .Nm timeout_add_tv ,
+.Nm timeout_abs_ts ,
 .Nm timeout_del ,
 .Nm timeout_del_barrier ,
 .Nm timeout_barrier ,
@@ -44,281 +46,370 @@
 .Nm timeout_triggered ,
 .Nm TIMEOUT_INITIALIZER ,
 .Nm TIMEOUT_INITIALIZER_FLAGS
-.Nd execute a function after a specified period of time
+.Nd execute a function in the future
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/timeout.h
 .Ft void
-.Fn timeout_set "struct timeout *to" "void (*fn)(void *)" "void *arg"
+.Fo timeout_set
+.Fa "struct timeout *to"
+.Fa "void (*fn)(void *)"
+.Fa "void *arg"
+.Fc
 .Ft void
 .Fo timeout_set_flags
 .Fa "struct timeout *to"
 .Fa "void (*fn)(void *)"
 .Fa "void *arg"
+.Fa "int kclock"
 .Fa "int flags"
 .Fc
 .Ft void
-.Fn timeout_set_proc "struct timeout *to" "void (*fn)(void *)" "void *arg"
+.Fo timeout_set_proc
+.Fa "struct timeout *to"
+.Fa "void (*fn)(void *)"
+.Fa "void *arg"
+.Fc
 .Ft int
-.Fn timeout_add "struct timeout *to" "int ticks"
+.Fo timeout_add
+.Fa "struct timeout *to"
+.Fa "int nticks"
+.Fc
 .Ft int
-.Fn timeout_del "struct timeout *to"
+.Fo timeout_add_sec
+.Fa "struct timeout *to"
+.Fa "int secs"
+.Fc
 .Ft int
-.Fn timeout_del_barrier "struct timeout *to"
-.Ft void
-.Fn timeout_barrier "struct timeout *to"
+.Fo timeout_add_msec
+.Fa "struct timeout *to"
+.Fa "int msecs"
+.Fc
 .Ft int
-.Fn timeout_pending "struct timeout *to"
+.Fo timeout_add_usec
+.Fa "struct timeout *to"
+.Fa "int usecs"
+.Fc
 .Ft int
-.Fn timeout_initialized "struct timeout *to"
+.Fo timeout_add_nsec
+.Fa "struct timeout *to"
+.Fa "int nsecs"
+.Fc
 .Ft int
-.Fn timeout_triggered "struct timeout *to"
+.Fo timeout_add_tv
+.Fa "struct timeout *to"
+.Fa "struct timeval *tv"
+.Fc
 .Ft int
-.Fn timeout_add_tv "struct timeout *to" "struct timeval *"
+.Fo timeout_abs_ts
+.Fa "struct timeout *to"
+.Fa "const struct timespec *abs"
+.Fc
 .Ft int
-.Fn timeout_add_sec "struct timeout *to" "int sec"
+.Fo timeout_del
+.Fa "struct timeout *to"
+.Fc
 .Ft int
-.Fn timeout_add_msec "struct timeout *to" "int msec"
+.Fo timeout_del_barrier
+.Fa "struct timeout *to"
+.Fc
+.Ft void
+.Fo timeout_barrier
+.Fa "struct timeout *to"
+.Fc
+.Ft int
+.Fo timeout_pending
+.Fa "struct timeout *to"
+.Fc
 .Ft int
-.Fn timeout_add_usec "struct timeout *to" "int usec"
+.Fo timeout_initialized
+.Fa "struct timeout *to"
+.Fc
 .Ft int
-.Fn timeout_add_nsec "struct timeout *to" "int nsec"
-.Fn TIMEOUT_INITIALIZER "void (*fn)(void *)" "void *arg"
-.Fn TIMEOUT_INITIALIZER_FLAGS "void (*fn)(void *)" "void *arg" "int flags"
+.Fo timeout_triggered
+.Fa "struct timeout *to"
+.Fc
+.Fo TIMEOUT_INITIALIZER
+.Fa "void (*fn)(void *)"
+.Fa "void *arg"
+.Fc
+.Fo TIMEOUT_INITIALIZER_FLAGS
+.Fa "void (*fn)(void *)"
+.Fa "void *arg"
+.Fa "int kclock"
+.Fa "int flags"
+.Fc
 .Sh DESCRIPTION
 The
 .Nm timeout
-API provides a mechanism to execute a function at a given time.
-The granularity of the time is limited by the granularity of the
-.Xr hardclock 9
-timer which executes
-.Xr hz 9
-times a second.
+API provides a mechanism to schedule a function for asynchronous
+execution in the future.
 .Pp
-It is the responsibility of the caller to provide these functions with
-pre-allocated timeout structures.
+All state is encapsulated in a caller-allocated
+.Vt struct timeout
+.Po
+hereafter,
+a
+.Qq timeout
+.Pc .
+A timeout must be initialized before it may be 
