sparc64/clock.c: use ANSI-style function definitions

2021-02-23 Thread Scott Cheloha
I'm poking around in this file and lo, K&R function definitions.

Any reason not to ANSIfy these?  While we're here we can kill some
ARGSUSED comments, too.

I don't have a sparc64 machine so I'd appreciate a test compile.

Assuming it compiles, ok?

Index: clock.c
===
RCS file: /cvs/src/sys/arch/sparc64/sparc64/clock.c,v
retrieving revision 1.68
diff -u -p -r1.68 clock.c
--- clock.c 23 Feb 2021 04:44:31 -  1.68
+++ clock.c 23 Feb 2021 15:12:31 -
@@ -215,10 +215,7 @@ int timerblurb = 10; /* Guess a value; u
  * own special match function to call it the "clock".
  */
 static int
-clockmatch_sbus(parent, cf, aux)
-   struct device *parent;
-   void *cf;
-   void *aux;
+clockmatch_sbus(struct device *parent, void *cf, void *aux)
 {
struct sbus_attach_args *sa = aux;
 
@@ -226,10 +223,7 @@ clockmatch_sbus(parent, cf, aux)
 }
 
 static int
-clockmatch_ebus(parent, cf, aux)
-   struct device *parent;
-   void *cf;
-   void *aux;
+clockmatch_ebus(struct device *parent, void *cf, void *aux)
 {
struct ebus_attach_args *ea = aux;
 
@@ -237,10 +231,7 @@ clockmatch_ebus(parent, cf, aux)
 }
 
 static int
-clockmatch_fhc(parent, cf, aux)
-   struct device *parent;
-   void *cf;
-   void *aux;
+clockmatch_fhc(struct device *parent, void *cf, void *aux)
 {
struct fhc_attach_args *fa = aux;
 
@@ -270,11 +261,8 @@ clockmatch_fhc(parent, cf, aux)
  * a non-trivial operation.  
  */
 
-/* ARGSUSED */
 static void
-clockattach_sbus(parent, self, aux)
-   struct device *parent, *self;
-   void *aux;
+clockattach_sbus(struct device *parent, struct device *self, void *aux)
 {
struct sbus_attach_args *sa = aux;
bus_space_tag_t bt = sa->sa_bustag;
@@ -309,9 +297,7 @@ clockattach_sbus(parent, self, aux)
  * change are not atomic.
  */
 int
-clock_bus_wenable(handle, onoff)
-   struct todr_chip_handle *handle;
-   int onoff;
+clock_bus_wenable(struct todr_chip_handle *handle, int onoff)
 {
int s, err = 0;
int prot; /* nonzero => change prot */
@@ -335,11 +321,8 @@ clock_bus_wenable(handle, onoff)
return (err);
 }
 
-/* ARGSUSED */
 static void
-clockattach_ebus(parent, self, aux)
-   struct device *parent, *self;
-   void *aux;
+clockattach_ebus(struct device *parent, struct device *self, void *aux)
 {
struct ebus_attach_args *ea = aux;
bus_space_tag_t bt;
@@ -380,9 +363,7 @@ clockattach_ebus(parent, self, aux)
 }
 
 static void
-clockattach_fhc(parent, self, aux)
-   struct device *parent, *self;
-   void *aux;
+clockattach_fhc(struct device *parent, struct device *self, void *aux)
 {
struct fhc_attach_args *fa = aux;
bus_space_tag_t bt = fa->fa_bustag;
@@ -409,10 +390,7 @@ clockattach_fhc(parent, self, aux)
 }
 
 static void
-clockattach(node, bt, bh)
-   int node;
-   bus_space_tag_t bt;
-   bus_space_handle_t bh;
+clockattach(int node, bus_space_tag_t bt, bus_space_handle_t bh)
 {
char *model;
struct idprom *idp;
@@ -467,10 +445,7 @@ getidprom(void)
  * the lame UltraSPARC IIi PCI machines that don't have them.
  */
 static int
-timermatch(parent, cf, aux)
-   struct device *parent;
-   void *cf;
-   void *aux;
+timermatch(struct device *parent, void *cf, void *aux)
 {
 #ifndef MULTIPROCESSOR
struct mainbus_attach_args *ma = aux;
@@ -483,9 +458,7 @@ timermatch(parent, cf, aux)
 }
 
 static void
-timerattach(parent, self, aux)
-   struct device *parent, *self;
-   void *aux;
+timerattach(struct device *parent, struct device *self, void *aux)
 {
struct mainbus_attach_args *ma = aux;
u_int *va = ma->ma_address;
@@ -518,8 +491,7 @@ timerattach(parent, self, aux)
 }
 
 void
-stopcounter(creg)
-   struct timer_4u *creg;
+stopcounter(struct timer_4u *creg)
 {
/* Stop the clock */
volatile int discard;
@@ -531,8 +503,7 @@ stopcounter(creg)
  * XXX this belongs elsewhere
  */
 void
-myetheraddr(cp)
-   u_char *cp;
+myetheraddr(u_char *cp)
 {
struct idprom *idp;
 
@@ -714,10 +685,8 @@ cpu_initclocks(void)
 /*
  * Dummy setstatclockrate(), since we know profhz==hz.
  */
-/* ARGSUSED */
 void
-setstatclockrate(newhz)
-   int newhz;
+setstatclockrate(int newhz)
 {
/* nothing */
 }
@@ -731,8 +700,7 @@ setstatclockrate(newhz)
 static int clockcheck = 0;
 #endif
 int
-clockintr(cap)
-   void *cap;
+clockintr(void *cap)
 {
 #ifdef DEBUG
static int64_t tick_base = 0;
@@ -778,8 +746,7 @@ clockintr(cap)
  * locore.s to a level 10.
  */
 int
-tickintr(cap)
-   void *cap;
+tickintr(void *cap)
 {
struct cpu_info *ci = curcpu();
u_int64_t s;
@@ -803,8 +770,7 @@ tickintr(cap)
 }
 
 int
-sys_tickintr(cap)
-   void *cap;
+sys_tickintr(void *cap)
 {
struct cpu_info *ci = curcpu();
u_int64_t s;
@@ -827,8 +793,7 @@ sys_tickintr(cap)
 }
 
 int

timecounting: use C99-style initialization for all timecounter structs

2021-02-18 Thread Scott Cheloha
Hi,

If the timecounter struct changes again in the future it will be
easier to make the change if we are using C99-style initialization
everywhere.  In general I think C99-style initialization is easier to
read for larger structs.  The timecounter struct definitely qualifies
as "larger".  We probably should already be doing this but nobody has
bothered yet.

So I will bother.  This patch changes every timecounter struct to use
C99-style initialization.  Some are already using it but most are not.

Yes, I am aware that this is tedious to review.  I'm sorry.  I think
suffering this now will pay off in the future.

Speaking of the future: in a subsequent patch I would like to remove
several of the zero and NULL members, as C99 guarantees that
omission of a member at initialization causes it to be implicitly
zeroed.  For instance, there is no reason to set .tc_user if the
timecounter has no corresponding driver in libc.  There are also no
drivers setting the .tc_poll_pps function pointer, so we can just let
it implicitly be NULL.  And if the timecounter needs no private cookie
we don't need to explicitly set .tc_priv to NULL.  Et cetera.
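
For illustration (not part of this diff): since C99 implicitly zeroes
omitted members, a timecounter like the alpha rpcc one below could
eventually shrink to something like

	struct timecounter rpcc_timecounter = {
		.tc_get_timecount = rpcc_get_timecount,
		.tc_counter_mask = ~0u,
		.tc_name = "rpcc",
	};

with .tc_poll_pps, .tc_frequency, .tc_quality, .tc_priv, and .tc_user
all implicitly zero/NULL.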

I suppose if people prefer it we _could_ do such changes in this
patch.  I'm leaning toward not doing that.  Switching to the C99 style
*and* dropping members will make review more difficult and increase
the likelihood of a mistake, i.e. I will accidentally break the build
on some platform and people will yell at me, which I want to avoid.

Thoughts?  Preferences?  ok?

Index: ./arch/alpha/alpha/clock.c
===
RCS file: /cvs/src/sys/arch/alpha/alpha/clock.c,v
retrieving revision 1.24
diff -u -p -r1.24 clock.c
--- ./arch/alpha/alpha/clock.c  6 Jul 2020 13:33:06 -   1.24
+++ ./arch/alpha/alpha/clock.c  19 Feb 2021 02:57:55 -
@@ -64,7 +64,14 @@ int clk_irq = 0;
 
 u_int rpcc_get_timecount(struct timecounter *);
 struct timecounter rpcc_timecounter = {
-   rpcc_get_timecount, NULL, ~0u, 0, "rpcc", 0, NULL, 0
+   .tc_get_timecount = rpcc_get_timecount,
+   .tc_poll_pps = NULL,
+   .tc_counter_mask = ~0u,
+   .tc_frequency = 0,
+   .tc_name = "rpcc",
+   .tc_quality = 0,
+   .tc_priv = NULL,
+   .tc_user = 0,
 };
 
 extern todr_chip_handle_t todr_handle;
Index: ./arch/amd64/amd64/tsc.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.22
diff -u -p -r1.22 tsc.c
--- ./arch/amd64/amd64/tsc.c24 Dec 2020 04:20:48 -  1.22
+++ ./arch/amd64/amd64/tsc.c19 Feb 2021 02:57:55 -
@@ -52,7 +52,14 @@ extern u_int32_t lapic_per_second;
 #endif
 
 struct timecounter tsc_timecounter = {
-   tsc_get_timecount, NULL, ~0u, 0, "tsc", -1000, NULL, TC_TSC
+   .tc_get_timecount = tsc_get_timecount,
+   .tc_poll_pps = NULL,
+   .tc_counter_mask = ~0u,
+   .tc_frequency = 0,
+   .tc_name = "tsc",
+   .tc_quality = -1000,
+   .tc_priv = NULL,
+   .tc_user = TC_TSC,
 };
 
 uint64_t
Index: ./arch/amd64/isa/clock.c
===
RCS file: /cvs/src/sys/arch/amd64/isa/clock.c,v
retrieving revision 1.34
diff -u -p -r1.34 clock.c
--- ./arch/amd64/isa/clock.c6 Jul 2020 13:33:06 -   1.34
+++ ./arch/amd64/isa/clock.c19 Feb 2021 02:57:55 -
@@ -116,7 +116,14 @@ u_int i8254_get_timecount(struct timecou
 u_int i8254_simple_get_timecount(struct timecounter *tc);
 
 static struct timecounter i8254_timecounter = {
-   i8254_get_timecount, NULL, ~0u, TIMER_FREQ, "i8254", 0, NULL, 0
+   .tc_get_timecount = i8254_get_timecount,
+   .tc_poll_pps = NULL,
+   .tc_counter_mask = ~0u,
+   .tc_frequency = TIMER_FREQ,
+   .tc_name = "i8254",
+   .tc_quality = 0,
+   .tc_priv = NULL,
+   .tc_user = 0,
 };
 
int	clockintr(void *);
Index: ./arch/armv7/omap/dmtimer.c
===
RCS file: /cvs/src/sys/arch/armv7/omap/dmtimer.c,v
retrieving revision 1.9
diff -u -p -r1.9 dmtimer.c
--- ./arch/armv7/omap/dmtimer.c 19 Jan 2021 18:04:43 -  1.9
+++ ./arch/armv7/omap/dmtimer.c 19 Feb 2021 02:57:55 -
@@ -111,7 +111,13 @@ void dmtimer_setstatclockrate(int newhz)
 u_int dmtimer_get_timecount(struct timecounter *);
 
 static struct timecounter dmtimer_timecounter = {
-   dmtimer_get_timecount, NULL, 0xffffffff, 0, "dmtimer", 0, NULL
+   .tc_get_timecount = dmtimer_get_timecount,
+   .tc_poll_pps = NULL,
+   .tc_counter_mask = 0xffffffff,
+   .tc_frequency = 0,
+   .tc_name = "dmtimer",
+   .tc_quality = 0,
+   .tc_priv = NULL,
 };
 
 bus_space_handle_t dmtimer_ioh0;
Index: ./arch/armv7/omap/gptimer.c
===
RCS file: /cvs/src/sys/arch/armv7/omap/gptimer.c,v
retrieving revision 1.11
diff -u -p -r1.11 gptimer.c
--- 

Re: all platforms: isolate hardclock(9) from statclock()

2021-01-14 Thread Scott Cheloha
On Thu, Jan 07, 2021 at 11:15:42AM -0600, Scott Cheloha wrote:
> Hi,
> 
> I want to isolate statclock() from hardclock(9).  This will simplify
> the logic in my WIP dynamic clock interrupt framework.
> 
> Currently, if stathz is zero, we call statclock() from within
> hardclock(9).  It looks like this (see sys/kern/kern_clock.c):
> 
> void
> hardclock(struct clockframe *frame)
> {
>   /* [...] */
> 
>   if (stathz == 0)
>   statclock(frame);
> 
>   /* [...] */
> 
> This is the case on alpha, amd64 (w/ lapic), hppa, i386 (w/ lapic),
> loongson, luna88k, mips64, and sh.
> 
> (We seem to do it on sgi, too.  I was under the impression that sgi
> *was* a mips64 platform, yet sgi seems to have its own clock
> interrupt code distinct from mips64's general clock interrupt code
> which is used by e.g. octeon).
> 
> However, if stathz is not zero we call statclock() separately.  This
> is the case on armv7, arm, arm64, macppc, powerpc64, and sparc64.
> 
> (The situation for the general powerpc code and socppc in particular
> is a mystery to me.)
> 
> If we could remove this MD distinction it would make my MI framework
> simpler.  Instead of checking stathz and conditionally starting a
> statclock event I will be able to unconditionally start a statclock
> event on all platforms on every CPU.
> 
> In general I don't think the "is stathz zero?" variance between
> platforms is useful:
> 
> - The difference is invisible to userspace, as we hide the fact that
>   stathz is zero from e.g. the kern.clockrate sysctl.
> 
> - We run statclock() at some point, regardless of whether stathz is
>   zero.  If statclock() is run from hardclock(9) then isn't stathz
>   effectively equal to hz?
> 
> - Because stathz might be zero we need to add a bunch of safety checks
>   to our MI code to ensure we don't accidentally divide by zero.
> 
> Maybe we can ensure stathz is non-zero in a later diff...
> 
> --
> 
> Anyway, I don't think I have missed any platforms.  However, if
> platform experts could weigh in here to verify my changes (and test
> them!) I'd really appreciate it.
> 
> In particular, I'm confused about how clock interrupts work on
> powerpc, socppc, and sgi.

Updated diff.

- As noted by visa@, loongson already has a distinct statclock
  interrupt, so remove the changes to arch/loongson/dev/glxclk.c.

I have successful tests on these platforms:

- amd64
- luna88k
- hppa

Still need tests on these platforms:

- alpha
- i386
- mips64 (octeon will be sufficient)
- sgi
- sh

Related: is there a way to test i386 on amd64 hardware?

Index: sys/kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.102
diff -u -p -r1.102 kern_clock.c
--- sys/kern/kern_clock.c   13 Jan 2021 16:28:49 -  1.102
+++ sys/kern/kern_clock.c   14 Jan 2021 18:39:03 -
@@ -160,12 +160,6 @@ hardclock(struct clockframe *frame)
}
}
 
-   /*
-* If no separate statistics clock is available, run it from here.
-*/
-   if (stathz == 0)
-   statclock(frame);
-
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
 
Index: sys/arch/alpha/alpha/clock.c
===
RCS file: /cvs/src/sys/arch/alpha/alpha/clock.c,v
retrieving revision 1.24
diff -u -p -r1.24 clock.c
--- sys/arch/alpha/alpha/clock.c6 Jul 2020 13:33:06 -   1.24
+++ sys/arch/alpha/alpha/clock.c14 Jan 2021 18:39:04 -
@@ -136,6 +136,13 @@ clockattach(dev, fns)
  * Machine-dependent clock routines.
  */
 
+void
+clockintr(struct clockframe *frame)
+{
+   hardclock(frame);
+   statclock(frame);
+}
+
 /*
  * Start the real-time and statistics clocks. Leave stathz 0 since there
  * are no other timers available.
@@ -165,7 +172,7 @@ cpu_initclocks(void)
 * hardclock, which would then fall over because the pointer
 * to the virtual timers wasn't set at that time.
 */
-   platform.clockintr = hardclock;
+   platform.clockintr = clockintr;
schedhz = 16;
 
evcount_attach(&clk_count, "clock", &clk_irq);
Index: sys/arch/amd64/amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.57
diff -u -p -r1.57 lapic.c
--- sys/arch/amd64/amd64/lapic.c6 Sep 2020 20:50:00 -   1.57
+++ sys/arch/amd64/amd64/lapic.c14 Jan 2021 18:39:04 -
@@ -452,6 +452,7 @@ lapic_clockintr(void *arg, struct intrfr
floor = ci->ci_handled_intr_level;
ci->ci_handled_intr_level = ci->ci_ilevel;
hardclock((struct clockframe *)&frame);
+   sta

Re: tpm(4): don't use tvtohz(9)

2021-01-13 Thread Scott Cheloha
On Fri, Jan 08, 2021 at 08:21:14PM +0100, Florian Obser wrote:
> On Fri, Jan 08, 2021 at 11:48:33AM -0600, Scott Cheloha wrote:
> > On Fri, Jan 08, 2021 at 05:41:24PM +0100, Mark Kettenis wrote:
> > > > Date: Fri, 8 Jan 2021 10:27:38 -0600
> > > > From: Scott Cheloha 
> > > > 
> > > > On Wed, Jan 06, 2021 at 11:26:27PM +0100, Mark Kettenis wrote:
> > > > > > Date: Wed, 6 Jan 2021 16:16:27 -0600
> > > > > > From: Scott Cheloha 
> > > > > > 
> > > > > > On Wed, Jan 06, 2021 at 12:16:13PM -0600, Scott Cheloha wrote:
> > > > > > > As mentioned in a prior mail, tpm(4) is the last user of 
> > > > > > > tvtohz(9) in
> > > > > > > the tree.
> > > > > > > 
> > > > > > > However, we don't need to use tvtohz(9) in tpm(4) at all.  
> > > > > > > Converting
> > > > > > > from milliseconds to ticks is trivial.  Using an intermediary 
> > > > > > > timeval
> > > > > > > is just pointless indirection.
> > > > > > > 
> > > > > > > With this committed I will be able to remove both tvtohz(9) and
> > > > > > > tstohz(9) from the tree in a subsequent patch.
> > > > > > 
> > > > > > Whoops, made a mistake in the prior diff.
> > > > > > 
> > > > > > We want to MAX() the result up to 1 to avoid dividing to zero and
> > > > > > blocking indefinitely in e.g. tsleep(9).  Previous diff MIN'd the
> > > > > > result down to 1, which is very wrong.
> > > > > 
> > > > > To be honest I'd just zap the function completely and instead simply 
> > > > > do:
> > > > > 
> > > > > to = TPM_ACCESS_TMO / 10;
> > > > > 
> > > > > and
> > > > > 
> > > > > to = TPM_BURST_TMO / 10;
> > > > 
> > > > That won't work on custom kernels where HZ is not 100, no?
> > > 
> > > HZ is irrelevant here.  These TPM_XXX_TMO defines specify a timeout in
> > > microseconds and DELAY(10) just delays for 10us.  So you just need to
> > > figure out how many times you need to do the 10us delay to reach the
> > > timeout specified by TPM_XXX_TMO.
> > 
> > Whoops, yes, you're right.
> > 
> > > Also, this driver is only supported on amd64/i386 at this point (but
> > > might show up on arm64 at some point).
> > > 
> > > > > There is no magic happening here.  The code is just doing a hardware
> > > > > register poll loop in 10us steps.
> > > > > 
> > > > > Hmm, actually it seems the code is broken and uses steps of 10
> > > > > microseconds instead of milliseconds.  So instead it should probably
> > > > > use:
> > > > > 
> > > > > to = TPM_ACCESS_TMO * 100;
> > > > > 
> > > > > and
> > > > > 
> > > > > to = TPM_BURST_TMO * 100;
> > > > 
> > > > This problem came up in a different thread:
> > > > 
> > > > https://marc.info/?l=openbsd-tech&m=160833962329381&w=2
> > > > 
> > > > jcs@ said, and I paraphrase, "tpm(4) sucks, we should merge NetBSD's
> > > > latest code, which is much nicer now."
> > > > 
> > > > That sounds like a much taller order than I can fill, so I'm just
> > > > trying to remove the tvtohz(9) call without changing any behavior,
> > > > even if the behavior looks wrong (like using the wrong units).
> > > > 
> > > > I don't even have a tpm(4) device to test.  Until I have one I'm
> > > > reluctant to do things like expanding the delay time in these loops.
> > > > As of now the driver "works" for some subset of people, which is not
> > > > nothing.
> > > 
> > > But having illogical code is a problem as well.  So I think we should
> > > at least attempt to fix it the right way.  All it takes is to find
> > > someone with a laptop where the driver attaches and have them
> > > suspend/resume it.
> > 
> > We can try that, sure.  Here's the patch:
> > 
> > - Remove tpm_tmotohz().
> > 
> > - tpm_waitfor() does DELAY(1), so in that case convert from
> >   milliseconds to microseconds.  Note in the function argument
> >   that we expect milliseconds, not 

Re: all platforms: isolate hardclock(9) from statclock()

2021-01-13 Thread Scott Cheloha
On Sat, Jan 09, 2021 at 12:52:22PM -0600, Dale Rahn wrote:
> The 'magic' here was that MD code could choose to implement statclock (and
> set stathz appropriately), or MD code could not care about the multiple
> statclock/hardclock interfaces into the scheduler. Also some clock drivers
> on a platform could enable split hardclock/statclock where others did not.

Gotcha.

> Back near the beginning of OpenBSD, platforms existed that had no higher
> precision clock than that timer interrupt, and nothing existed to even
> approximate higher precision (eg cycle counters or instruction counters).

My understanding is that HZ=100 was a practical choice because the
machines of the day could not reliably drive a faster clock interrupt.

> Some clock drivers have a separate timer/interrupt or separate 'event'
> tracked to schedule the stat() event. These platforms may also
> (dynamically) change the stathz clock when profiling is enabled/disabled.
> This is implemented in arm64 in agtimer_setstatclockrate()

My hope is to make this the case for *all* platforms with MI code.
More on that in a later patch.  You'll be CC'd.

> Any clock driver that directly calls statclock() should make certain to
> set stathz (and profhz) appropriately, as no assumptions about its
> rate/frequency should be made.

Do we need to properly set stathz for each platform in *this* diff?
Or can it wait?

I was hoping to do a sweep of the tree in a later patch and ensure
that stathz is non-zero everywhere and simultaneously remove code like
this:

int realstathz = stathz ? stathz : hz;

Near as I can tell, stathz doesn't need to be nonzero for statclock()
to work correctly.  Also, setstatclockrate() doesn't even run unless
stathz is non-zero, so there are no issues with the profiling clock
stuff yet.

> This isn't to say that removing the stathz == 0 magic should not be done,
> but if done, make certain that stathz and profhz are properly
> updated/configured.



sysctl(8), kernel: remove dead variable: tickadj

2021-01-08 Thread Scott Cheloha
The global variable "tickadj" has no users in the kernel anymore and
should be eliminated from all code and documentation.

At one time "tickadj" controlled the skew rate for adjtime(2), but it
has been unused since the modern timecounting subsystem was imported
from FreeBSD circa 2004-2005.

FreeBSD noted this vestigiality and removed the variable in 2002:

https://cgit.freebsd.org/src/commit/?id=e1d970f1811e5e1e9c912c032acdcec6521b2a6d

ok?

Index: lib/libc/sys/sysctl.2
===
RCS file: /cvs/src/lib/libc/sys/sysctl.2,v
retrieving revision 1.42
diff -u -p -r1.42 sysctl.2
--- lib/libc/sys/sysctl.2   29 Dec 2020 12:28:23 -  1.42
+++ lib/libc/sys/sysctl.2   9 Jan 2021 02:03:11 -
@@ -550,9 +550,8 @@ The scheduler exponential decay value.
 A
 .Vt struct clockinfo
 structure is returned.
-This structure contains the clock, statistics clock and profiling clock
-frequencies, the number of micro-seconds per hz tick, and the clock
-skew rate.
+This structure contains the hardclock, statistics clock and profiling clock
+frequencies, and the number of microseconds per hardclock tick.
 .It Dv KERN_CONSDEV Pq Va kern.consdev
 The console device.
 .It Dv KERN_CPTIME Pq Va kern.cp_time
Index: sys/conf/param.c
===
RCS file: /cvs/src/sys/conf/param.c,v
retrieving revision 1.45
diff -u -p -r1.45 param.c
--- sys/conf/param.c7 Sep 2019 01:23:23 -   1.45
+++ sys/conf/param.c9 Jan 2021 02:03:11 -
@@ -75,7 +75,6 @@
int	hz = HZ;
int	tick = 1000000 / HZ;
int	tick_nsec = 1000000000 / HZ;
-int	tickadj = 240000 / (60 * HZ);	/* can adjust 240ms in 60s */
int	utc_offset = 0;
#define	NPROCESS (30 + 16 * MAXUSERS)
#define	NTEXT (80 + NPROCESS / 8)	/* actually the object cache */
Index: sys/arch/mips64/mips64/mips64_machdep.c
===
RCS file: /cvs/src/sys/arch/mips64/mips64/mips64_machdep.c,v
retrieving revision 1.33
diff -u -p -r1.33 mips64_machdep.c
--- sys/arch/mips64/mips64/mips64_machdep.c 11 Jul 2020 15:18:08 -  
1.33
+++ sys/arch/mips64/mips64/mips64_machdep.c 9 Jan 2021 02:03:11 -
@@ -323,14 +323,13 @@ cp0_calibrate(struct cpu_info *ci)
  * Start the real-time and statistics clocks.
  */
 void
-cpu_initclocks()
+cpu_initclocks(void)
 {
struct cpu_info *ci = curcpu();
 
profhz = hz;
 
tick = 1000000 / hz;	/* number of micro-seconds between interrupts */
-	tickadj = 240000 / (60 * hz);	/* can adjust 240ms in 60s */
 
cp0_calibrate(ci);
 
Index: sys/kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.101
diff -u -p -r1.101 kern_clock.c
--- sys/kern/kern_clock.c   21 Jan 2020 16:16:23 -  1.101
+++ sys/kern/kern_clock.c   9 Jan 2021 02:03:11 -
@@ -120,10 +120,6 @@ initclocks(void)
profhz = i;
psratio = profhz / i;
 
-   /* For very large HZ, ensure that division by 0 does not occur later */
-   if (tickadj == 0)
-   tickadj = 1;
-
inittimecounter();
 }
 
@@ -421,7 +417,6 @@ sysctl_clockrate(char *where, size_t *si
 */
memset(&clkinfo, 0, sizeof clkinfo);
clkinfo.tick = tick;
-   clkinfo.tickadj = tickadj;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
clkinfo.stathz = stathz ? stathz : hz;
Index: sys/sys/kernel.h
===
RCS file: /cvs/src/sys/sys/kernel.h,v
retrieving revision 1.24
diff -u -p -r1.24 kernel.h
--- sys/sys/kernel.h15 Oct 2020 15:36:31 -  1.24
+++ sys/sys/kernel.h9 Jan 2021 02:03:11 -
@@ -51,7 +51,6 @@ extern int utc_offset;/* seconds east 
 
extern int tick;	/* usec per tick (1000000 / hz) */
 extern int tick_nsec;  /* nsec per tick */
-extern int tickadj;	/* "standard" clock skew, us./tick */
 extern int ticks;  /* # of hardclock ticks */
 extern int hz; /* system clock's frequency */
 extern int stathz; /* statistics clock's frequency */
Index: sys/sys/time.h
===
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.57
diff -u -p -r1.57 time.h
--- sys/sys/time.h  15 Oct 2020 16:31:11 -  1.57
+++ sys/sys/time.h  9 Jan 2021 02:03:11 -
@@ -157,7 +157,6 @@ struct  itimerval {
 struct clockinfo {
int hz; /* clock frequency */
int tick;   /* micro-seconds per hz tick */
-   int tickadj;/* clock skew rate for adjtime() */
int stathz; /* statistics clock frequency */
int profhz; /* profiling clock frequency */
 };

Re: sleep_setup/finish simplification

2021-01-08 Thread Scott Cheloha
On Mon, Dec 28, 2020 at 11:41:52AM -0300, Martin Pieuchot wrote:
> On 08/12/20(Tue) 10:06, Martin Pieuchot wrote:
> > Diff below aims to simplify the API to put a thread on a sleep queue and
> > reduce it to the following:
> > 
> > sleep_setup();
> > /* check condition or release lock */
> > sleep_finish();
> > 
> > It is motivated by my work to sleep the SCHED_LOCK() but might as well
> > prevent/fix some bugs.
> > 
> > The tricky part of the current implementation is that sleep_setup_signal()
> > can already park/stop the current thread resulting in a context change.
> > Should any custom accounting / lock check happen before that?  At least
> > two lock primitives do so currently:  drm's schedule_timeout() and
> > rwlock's rw_enter().
> > 
> > As a result of this diff various states can be removed and sleep_finish()
> > contains the following magic:
> > 
> > 1. check for signal/parking
> > 2. context switch or remove from sleep queue
> > 3. check for signal/parking
> > 
> > Note that sleep_finish() could be simplified even further but I left
> > that for later to ease the review.
> > 
> > Comments?  Oks?
> 
> Anyone?

I really like this simplification.

It also makes my forthcoming kclock changes to tsleep_nsec(9)/etc.
simpler, so it's doubly good for me.

I was hoping someone would step forward and OK this but nobody did, at
least not publicly.

I see claudio@ is trying to break off a piece of this for commit in a
different thread.  Unsure if that means this is dead or just being cut
up and merged piecemeal.

FWIW, ok cheloha@.  Obviously you need more OKs.

Even if this is dead, some other simplification in this vein would be
nice.

> > Index: dev/dt/dt_dev.c
> > ===
> > RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
> > retrieving revision 1.10
> > diff -u -p -r1.10 dt_dev.c
> > --- dev/dt/dt_dev.c 28 Sep 2020 13:16:58 -  1.10
> > +++ dev/dt/dt_dev.c 7 Dec 2020 17:19:15 -
> > @@ -225,10 +225,8 @@ dtread(dev_t dev, struct uio *uio, int f
> > return (EMSGSIZE);
> >  
> > while (!sc->ds_evtcnt) {
> > -   sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread");
> > -   sleep_setup_signal(&sls);
> > -   sleep_finish(&sls, !sc->ds_evtcnt);
> > -   error = sleep_finish_signal(&sls);
> > +   sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread", 0);
> > +   error = sleep_finish(&sls, !sc->ds_evtcnt);
> > if (error == EINTR || error == ERESTART)
> > break;
> > }
> > Index: dev/pci/drm/drm_linux.c
> > ===
> > RCS file: /cvs/src/sys/dev/pci/drm/drm_linux.c,v
> > retrieving revision 1.70
> > diff -u -p -r1.70 drm_linux.c
> > --- dev/pci/drm/drm_linux.c 14 Nov 2020 23:08:47 -  1.70
> > +++ dev/pci/drm/drm_linux.c 7 Dec 2020 17:19:15 -
> > @@ -110,26 +110,23 @@ schedule_timeout(long timeout)
> >  {
> > struct sleep_state sls;
> > long deadline;
> > -   int wait, spl;
> > +   int wait, spl, timo = 0;
> >  
> > MUTEX_ASSERT_LOCKED(&sch_mtx);
> > KASSERT(!cold);
> >  
> > -   sleep_setup(&sls, sch_ident, sch_priority, "schto");
> > if (timeout != MAX_SCHEDULE_TIMEOUT)
> > -   sleep_setup_timeout(&sls, timeout);
> > +   timo = timeout;
> > +   sleep_setup(&sls, sch_ident, sch_priority, "schto", timo);
> >  
> > wait = (sch_proc == curproc && timeout > 0);
> >  
> > spl = MUTEX_OLDIPL(&sch_mtx);
> > MUTEX_OLDIPL(&sch_mtx) = splsched();
> > mtx_leave(&sch_mtx);
> > -
> > -   sleep_setup_signal(&sls);
> > -
> > if (timeout != MAX_SCHEDULE_TIMEOUT)
> > deadline = ticks + timeout;
> > -   sleep_finish_all(&sls, wait);
> > +   sleep_finish(&sls, wait);
> > if (timeout != MAX_SCHEDULE_TIMEOUT)
> > timeout = deadline - ticks;
> >  
> > Index: dev/pci/if_myx.c
> > ===
> > RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
> > retrieving revision 1.112
> > diff -u -p -r1.112 if_myx.c
> > --- dev/pci/if_myx.c27 Nov 2020 00:13:15 -  1.112
> > +++ dev/pci/if_myx.c7 Dec 2020 17:19:15 -
> > @@ -1396,7 +1396,7 @@ myx_down(struct myx_softc *sc)
> > (void)myx_cmd(sc, MYXCMD_SET_IFDOWN, , NULL);
> >  
> > while (sc->sc_state != MYX_S_OFF) {
> > -   sleep_setup(&sls, sts, PWAIT, "myxdown");
> > +   sleep_setup(&sls, sts, PWAIT, "myxdown", 0);
> > membar_consumer();
> > sleep_finish(&sls, sc->sc_state != MYX_S_OFF);
> > }
> > Index: kern/kern_rwlock.c
> > ===
> > RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> > retrieving revision 1.45
> > diff -u -p -r1.45 kern_rwlock.c
> > --- kern/kern_rwlock.c  2 Mar 2020 17:07:49 -   1.45
> > +++ kern/kern_rwlock.c  7 Dec 2020 17:19:15 -
> > @@ -278,15 +278,13 @@ retry:
> > prio = op->wait_prio;
> > if 

Re: tpm(4): don't use tvtohz(9)

2021-01-08 Thread Scott Cheloha
On Fri, Jan 08, 2021 at 05:41:24PM +0100, Mark Kettenis wrote:
> > Date: Fri, 8 Jan 2021 10:27:38 -0600
> > From: Scott Cheloha 
> > 
> > On Wed, Jan 06, 2021 at 11:26:27PM +0100, Mark Kettenis wrote:
> > > > Date: Wed, 6 Jan 2021 16:16:27 -0600
> > > > From: Scott Cheloha 
> > > > 
> > > > On Wed, Jan 06, 2021 at 12:16:13PM -0600, Scott Cheloha wrote:
> > > > > As mentioned in a prior mail, tpm(4) is the last user of tvtohz(9) in
> > > > > the tree.
> > > > > 
> > > > > However, we don't need to use tvtohz(9) in tpm(4) at all.  Converting
> > > > > from milliseconds to ticks is trivial.  Using an intermediary timeval
> > > > > is just pointless indirection.
> > > > > 
> > > > > With this committed I will be able to remove both tvtohz(9) and
> > > > > tstohz(9) from the tree in a subsequent patch.
> > > > 
> > > > Whoops, made a mistake in the prior diff.
> > > > 
> > > > We want to MAX() the result up to 1 to avoid dividing to zero and
> > > > blocking indefinitely in e.g. tsleep(9).  Previous diff MIN'd the
> > > > result down to 1, which is very wrong.
> > > 
> > > To be honest I'd just zap the function completely and instead simply do:
> > > 
> > > to = TPM_ACCESS_TMO / 10;
> > > 
> > > and
> > > 
> > > to = TPM_BURST_TMO / 10;
> > 
> > That won't work on custom kernels where HZ is not 100, no?
> 
> HZ is irrelevant here.  These TPM_XXX_TMO defines specify a timeout in
> microseconds and DELAY(10) just delays for 10us.  So you just need to
> figure out how many times you need to do the 10us delay to reach the
> timeout specified by TPM_XXX_TMO.

Whoops, yes, you're right.

> Also, this driver is only supported on amd64/i386 at this point (but
> might show up on arm64 at some point).
> 
> > > There is no magic happening here.  The code is just doing a hardware
> > > register poll loop in 10us steps.
> > > 
> > > Hmm, actually it seems the code is broken and uses steps of 10
> > > microseconds instead of milliseconds.  So instead it should probably
> > > use:
> > > 
> > > to = TPM_ACCESS_TMO * 100;
> > > 
> > > and
> > > 
> > > to = TPM_BURST_TMO * 100;
> > 
> > This problem came up in a different thread:
> > 
> > https://marc.info/?l=openbsd-tech&m=160833962329381&w=2
> > 
> > jcs@ said, and I paraphrase, "tpm(4) sucks, we should merge NetBSD's
> > latest code, which is much nicer now."
> > 
> > That sounds like a much taller order than I can fill, so I'm just
> > trying to remove the tvtohz(9) call without changing any behavior,
> > even if the behavior looks wrong (like using the wrong units).
> > 
> > I don't even have a tpm(4) device to test.  Until I have one I'm
> > reluctant to do things like expanding the delay time in these loops.
> > As of now the driver "works" for some subset of people, which is not
> > nothing.
> 
> But having illogical code is a problem as well.  So I think we should
> at least attempt to fix it the right way.  All it takes is to find
> someone with a laptop where the driver attaches and have them
> suspend/resume it.

We can try that, sure.  Here's the patch:

- Remove tpm_tmotohz().

- tpm_waitfor() does DELAY(1), so in that case convert from
  milliseconds to microseconds.  Note in the function argument
  that we expect milliseconds, not "tries".

- In the other cases we do DELAY(10), so we only multiply by 100.
  Leave a comment explaining what we're doing.
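
To spell out the unit math (assuming, per the discussion above, that
the TPM_*_TMO macros are timeouts in milliseconds):

	/*
	 * TMO ms = TMO * 1000 us.  Polling in DELAY(10) steps, i.e.
	 * 10us per iteration:
	 *
	 *	iterations = TMO * 1000 / 10 = TMO * 100
	 *
	 * With DELAY(1), i.e. 1us steps, it is simply TMO * 1000.
	 */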

Unsure who can test.  We need a suspend/resume test with this patch.
Probably an older machine.  Newer machines tend to have TPM 2.0 chips.

Index: tpm.c
===
RCS file: /cvs/src/sys/dev/acpi/tpm.c,v
retrieving revision 1.10
diff -u -p -r1.10 tpm.c
--- tpm.c   22 May 2020 10:16:37 -  1.10
+++ tpm.c   8 Jan 2021 17:43:35 -
@@ -158,7 +158,6 @@ int tpm_request_locality(struct tpm_soft
 void   tpm_release_locality(struct tpm_softc *);
int	tpm_getburst(struct tpm_softc *);
uint8_t	tpm_status(struct tpm_softc *);
-int	tpm_tmotohz(int);
 
 struct cfattach tpm_ca = {
sizeof(struct tpm_softc),
@@ -385,7 +384,7 @@ tpm_request_locality(struct tpm_softc *s
bus_space_write_1(sc->sc_bt, sc->sc_bh, TPM_ACCESS,
TPM_ACCESS_REQUEST_USE);
 
-   to = tpm_tmotohz(TPM_ACCESS_TMO);
+   to = TPM_ACCESS_TMO * 100;  /* steps of 10 microseconds */
 
while (

Re: tpm(4): don't use tvtohz(9)

2021-01-08 Thread Scott Cheloha
On Wed, Jan 06, 2021 at 11:26:27PM +0100, Mark Kettenis wrote:
> > Date: Wed, 6 Jan 2021 16:16:27 -0600
> > From: Scott Cheloha 
> > 
> > On Wed, Jan 06, 2021 at 12:16:13PM -0600, Scott Cheloha wrote:
> > > As mentioned in a prior mail, tpm(4) is the last user of tvtohz(9) in
> > > the tree.
> > > 
> > > However, we don't need to use tvtohz(9) in tpm(4) at all.  Converting
> > > from milliseconds to ticks is trivial.  Using an intermediary timeval
> > > is just pointless indirection.
> > > 
> > > With this committed I will be able to remove both tvtohz(9) and
> > > tstohz(9) from the tree in a subsequent patch.
> > 
> > Whoops, made a mistake in the prior diff.
> > 
> > We want to MAX() the result up to 1 to avoid dividing to zero and
> > blocking indefinitely in e.g. tsleep(9).  Previous diff MIN'd the
> > result down to 1, which is very wrong.
> 
> To be honest I'd just zap the function completely and instead simply do:
> 
> to = TPM_ACCESS_TMO / 10;
> 
> and
> 
> to = TPM_BURST_TMO / 10;

That won't work on custom kernels where HZ is not 100, no?

> There is no magic happening here.  The code is just doing a hardware
> register poll loop in 10us steps.
> 
> Hmm, actually it seems the code is broken and uses steps of 10
> microseconds instead of milliseconds.  So instead it should probably
> use:
> 
> to = TPM_ACCESS_TMO * 100;
> 
> and
> 
> to = TPM_BURST_TMO * 100;

This problem came up in a different thread:

https://marc.info/?l=openbsd-tech&m=160833962329381&w=2

jcs@ said, and I paraphrase, "tpm(4) sucks, we should merge NetBSD's
latest code, which is much nicer now."

That sounds like a much taller order than I can fill, so I'm just
trying to remove the tvtohz(9) call without changing any behavior,
even if the behavior looks wrong (like using the wrong units).

I don't even have a tpm(4) device to test.  Until I have one I'm
reluctant to do things like expanding the delay time in these loops.
As of now the driver "works" for some subset of people, which is not
nothing.



Re: all platforms: isolate hardclock(9) from statclock()

2021-01-08 Thread Scott Cheloha
On Thu, Jan 07, 2021 at 08:12:10PM -0600, Scott Cheloha wrote:
> On Thu, Jan 07, 2021 at 09:37:58PM +0100, Mark Kettenis wrote:
> > > Date: Thu, 7 Jan 2021 11:15:41 -0600
> > > From: Scott Cheloha 
> > > 
> > > Hi,
> > > 
> > > I want to isolate statclock() from hardclock(9).  This will simplify
> > > the logic in my WIP dynamic clock interrupt framework.
> > > 
> > > Currently, if stathz is zero, we call statclock() from within
> > > hardclock(9).  It looks like this (see sys/kern/kern_clock.c):
> > > 
> > > void
> > > hardclock(struct clockframe *frame)
> > > {
> > >   /* [...] */
> > > 
> > >   if (stathz == 0)
> > >   statclock(frame);
> > > 
> > >   /* [...] */
> > > 
> > > This is the case on alpha, amd64 (w/ lapic), hppa, i386 (w/ lapic),
> > > loongson, luna88k, mips64, and sh.
> > > 
> > > (We seem to do it on sgi, too.  I was under the impression that sgi
> > > *was* a mips64 platform, yet sgi seems to have its own clock
> > > interrupt code distinct from mips64's general clock interrupt code
> > > which is used by e.g. octeon).
> > > 
> > > However, if stathz is not zero we call statclock() separately.  This
> > > is the case on armv7, arm, arm64, macppc, powerpc64, and sparc64.
> > > 
> > > (The situation for the general powerpc code and socppc in particular
> > > is a mystery to me.)
> > > 
> > > If we could remove this MD distinction it would make my MI framework
> > > simpler.  Instead of checking stathz and conditionally starting a
> > > statclock event I will be able to unconditionally start a statclock
> > > event on all platforms on every CPU.
> > > 
> > > In general I don't think the "is stathz zero?" variance between
> > > platforms is useful:
> > > 
> > > - The difference is invisible to userspace, as we hide the fact that
> > >   stathz is zero from e.g. the kern.clockrate sysctl.
> > > 
> > > - We run statclock() at some point, regardless of whether stathz is
> > >   zero.  If statclock() is run from hardclock(9) then isn't stathz
> > >   effectively equal to hz?
> > > 
> > > - Because stathz might be zero we need to add a bunch of safety checks
> > >   to our MI code to ensure we don't accidentally divide by zero.
> > > 
> > > Maybe we can ensure stathz is non-zero in a later diff...
> > > 
> > > --
> > > 
> > > Anyway, I don't think I have missed any platforms.  However, if
> > > platform experts could weigh in here to verify my changes (and test
> > > them!) I'd really appreciate it.
> > > 
> > > In particular, I'm confused about how clock interrupts work on
> > > powerpc, socppc, and sgi.
> > > 
> > > --
> > > 
> > > Thoughts?  Platform-specific OKs?
> > 
> > I wouldn't be opposed to doing this.  It is less magic!
> > 
> > But yes, this needs to be tested on the platforms that you change.
> 
> I guess I'll CC all the platform-specific people I'm aware of.
> 
> > Note that many platforms don't have separate schedclock and
> > statclock.  But on many platforms where we use a one-shot timer as the
> > clock we have a randomized statclock.  I'm sure Theo would love to
> > tell you about the cpuhog story...
> 
> I am familiar with cpuhog.  It's the one thing everybody mentions when
> I talk about clock interrupts and/or statclock().
> 
> Related:
> 
> I wonder if we could avoid the cpuhog problem entirely by implementing
> some kind of MI cycle counting clock API that we use to timestamp
> whenever we cross the syscall boundary, or enter an interrupt, etc.,
> to determine the time a thread spends using the CPU without any
> sampling error.
> 
> Instead of a process accumulating ticks from a sampling clock
> interrupt you would accumulate, say, a 64-bit count of cycles, or
> something like that.
> 
> Sampling with a regular clock interrupt is prone to error and trickery
> like cpuhog.  The classic BSD solution to the cpuhog exploit was to
> randomize the statclock/schedclock to make it harder to fool the
> sampler.  But if we used cycle counts or instruction counts at each
> state transition it would be impossible to fool because we wouldn't be
> sampling at all.
> 
> Unsure what the performance implications would be, but in general I
> would guess that most platforms have a way to count instructions or
> cycles and that reading t

Re: all platforms: isolate hardclock(9) from statclock()

2021-01-07 Thread Scott Cheloha
On Thu, Jan 07, 2021 at 09:37:58PM +0100, Mark Kettenis wrote:
> > Date: Thu, 7 Jan 2021 11:15:41 -0600
> > From: Scott Cheloha 
> > 
> > Hi,
> > 
> > I want to isolate statclock() from hardclock(9).  This will simplify
> > the logic in my WIP dynamic clock interrupt framework.
> > 
> > Currently, if stathz is zero, we call statclock() from within
> > hardclock(9).  It looks like this (see sys/kern/kern_clock.c):
> > 
> > void
> > hardclock(struct clockframe *frame)
> > {
> > /* [...] */
> > 
> > if (stathz == 0)
> > statclock(frame);
> > 
> > /* [...] */
> > 
> > This is the case on alpha, amd64 (w/ lapic), hppa, i386 (w/ lapic),
> > loongson, luna88k, mips64, and sh.
> > 
> > (We seem to do it on sgi, too.  I was under the impression that sgi
> > *was* a mips64 platform, yet sgi seems to have its own clock
> > interrupt code distinct from mips64's general clock interrupt code
> > which is used by e.g. octeon).
> > 
> > However, if stathz is not zero we call statclock() separately.  This
> > is the case on armv7, arm, arm64, macppc, powerpc64, and sparc64.
> > 
> > (The situation for the general powerpc code and socppc in particular
> > is a mystery to me.)
> > 
> > If we could remove this MD distinction it would make my MI framework
> > simpler.  Instead of checking stathz and conditionally starting a
> > statclock event I will be able to unconditionally start a statclock
> > event on all platforms on every CPU.
> > 
> > In general I don't think the "is stathz zero?" variance between
> > platforms is useful:
> > 
> > - The difference is invisible to userspace, as we hide the fact that
> >   stathz is zero from e.g. the kern.clockrate sysctl.
> > 
> > - We run statclock() at some point, regardless of whether stathz is
> >   zero.  If statclock() is run from hardclock(9) then isn't stathz
> >   effectively equal to hz?
> > 
> > - Because stathz might be zero we need to add a bunch of safety checks
> >   to our MI code to ensure we don't accidentally divide by zero.
> > 
> > Maybe we can ensure stathz is non-zero in a later diff...
> > 
> > --
> > 
> > Anyway, I don't think I have missed any platforms.  However, if
> > platform experts could weigh in here to verify my changes (and test
> > them!) I'd really appreciate it.
> > 
> > In particular, I'm confused about how clock interrupts work on
> > powerpc, socppc, and sgi.
> > 
> > --
> > 
> > Thoughts?  Platform-specific OKs?
> 
> I wouldn't be opposed to doing this.  It is less magic!
> 
> But yes, this needs to be tested on the platforms that you change.

I guess I'll CC all the platform-specific people I'm aware of.

> Note that many platforms don't have have separate schedclock and
> statclock.  But on many platforms where we use a one-shot timer as the
> clock we have a randomized statclock.  I'm sure Theo would love to
> tell you about the cpuhog story...

I am familiar with cpuhog.  It's the one thing everybody mentions when
I talk about clock interrupts and/or statclock().

Related:

I wonder if we could avoid the cpuhog problem entirely by implementing
some kind of MI cycle counting clock API that we use to timestamp
whenever we cross the syscall boundary, or enter an interrupt, etc.,
to determine the time a thread spends using the CPU without any
sampling error.

Instead of a process accumulating ticks from a sampling clock
interrupt you would accumulate, say, a 64-bit count of cycles, or
something like that.

Sampling with a regular clock interrupt is prone to error and trickery
like cpuhog.  The classic BSD solution to the cpuhog exploit was to
randomize the statclock/schedclock to make it harder to fool the
sampler.  But if we used cycle counts or instruction counts at each
state transition it would be impossible to fool because we wouldn't be
sampling at all.

Unsure what the performance implications would be, but in general I
would guess that most platforms have a way to count instructions or
cycles and that reading this data is fast enough for us to use it in
e.g. syscall() or the interrupt handler without a huge performance
hit.
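
A rough sketch of the idea, with invented names (cpu_cycles() and the
p_*cycles fields are not real interfaces, just illustration):

	/*
	 * Hypothetical: charge exact CPU time at each user<->kernel
	 * transition instead of sampling from statclock().
	 */
	void
	cycles_enter_kernel(struct proc *p)
	{
		uint64_t now = cpu_cycles();	/* e.g. RDTSC on amd64 */

		/* Close out the user-mode interval, no sampling error. */
		p->p_ucycles += now - p->p_cycle_mark;
		p->p_cycle_mark = now;
	}

	void
	cycles_return_to_user(struct proc *p)
	{
		uint64_t now = cpu_cycles();

		/* Close out the kernel-mode interval. */
		p->p_kcycles += now - p->p_cycle_mark;
		p->p_cycle_mark = now;
	}

Nothing is sampled, so a cpuhog-style program has no sampler to dodge.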

> Anyway, we probably want that on amd64 as well.

My WIP dynamic clock interrupt system can run a randomized statclock()
on amd64 boxes with a lapic.  I imagine we will be able to do the same
on i386 systems that have a lapic, too, though it will be slower
because all the i386 timecounters are glacial compared to the TSC.

Eventually I want to isolate schedclock() from statclock() and run it
as an independent event.  But that's a "later on" goal

Re: syncer_thread: sleep without lbolt

2021-01-07 Thread Scott Cheloha
On Sat, Dec 12, 2020 at 01:32:13PM -0600, Scott Cheloha wrote:
> Hi,
> 
> The syncer thread is one of the last users of the lbolt (lightning
> bolt!) sleep channel.
> 
> If we add a syncer-specific sleep channel (syncer_chan) and do a bit
> of time math we can replicate the current behavior and remove another
> lbolt user.
> 
> This isn't a perfect recreation of the current behavior.  In this
> version the sleep period will drift if processing takes longer than 1
> second.  I think it's good enough.  If people are concerned about a
> perfect recreation of the current behavior we *can* do it, but it will
> require more code.  I don't think it's worth it.
> 
> This also fixes two problems in the current code.  They aren't huge
> bugs, but they did jump out as potential problems because they make
> the syncer's behavior less deterministic:
> 
> - The current code uses gettime(9), which will jump and screw up your
>   measurement if someone calls settimeofday(2).  The new code uses the
>   uptime clock, which is monotonic and stable.
> 
> - The current code uses gettime(9), which has a resolution of 1
>   second.  Measuring a 1 second timeout with an interface with
>   a resolution of 1 second is crude and error-prone.  The new code
>   uses getnsecuptime(), which has a resolution of roughly 1/hz.
>   Much better.
> 
> I vaguely recall beck@ trying to do something with this in the recent
> past, so CC beck@.

1-ish month bump.

Index: vfs_sync.c
===
RCS file: /cvs/src/sys/kern/vfs_sync.c,v
retrieving revision 1.64
diff -u -p -r1.64 vfs_sync.c
--- vfs_sync.c  24 Jun 2020 22:03:41 -  1.64
+++ vfs_sync.c  25 Dec 2020 17:09:00 -
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -73,6 +74,7 @@ LIST_HEAD(synclist, vnode);
 static struct synclist *syncer_workitem_pending;
 
 struct proc *syncerproc;
+int syncer_chan;
 
 /*
  * The workitem queue.
@@ -130,6 +132,19 @@ vn_syncer_add_to_worklist(struct vnode *
 }
 
 /*
+ * TODO Move getnsecuptime() to kern_tc.c and document it when we have
+ * more users in the kernel.
+ */
+static uint64_t
+getnsecuptime(void)
+{
+   struct timespec now;
+
+   getnanouptime(&now);
+   return TIMESPEC_TO_NSEC(&now);
+}
+
+/*
  * System filesystem synchronizer daemon.
  */
 void
@@ -138,11 +153,11 @@ syncer_thread(void *arg)
struct proc *p = curproc;
struct synclist *slp;
struct vnode *vp;
-   time_t starttime;
+   uint64_t elapsed, start;
int s;
 
for (;;) {
-   starttime = gettime();
+   start = getnsecuptime();
 
/*
 * Push files whose dirty time has expired.
@@ -220,6 +235,7 @@ syncer_thread(void *arg)
rushjob -= 1;
continue;
}
+
/*
 * If it has taken us less than a second to process the
 * current work, then wait. Otherwise start right over
@@ -228,8 +244,11 @@ syncer_thread(void *arg)
 * matter as we are just trying to generally pace the
 * filesystem activity.
 */
-   if (gettime() == starttime)
-   tsleep_nsec(&lbolt, PPAUSE, "syncer", INFSLP);
+   elapsed = getnsecuptime() - start;
+   if (elapsed < SEC_TO_NSEC(1)) {
+   tsleep_nsec(&syncer_chan, PPAUSE, "syncer",
+   SEC_TO_NSEC(1) - elapsed);
+   }
}
 }
 
@@ -242,7 +261,7 @@ int
 speedup_syncer(void)
 {
if (syncerproc)
-   wakeup_proc(syncerproc, &lbolt);
+   wakeup_proc(syncerproc, &syncer_chan);
if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;



all platforms: isolate hardclock(9) from statclock()

2021-01-07 Thread Scott Cheloha
Hi,

I want to isolate statclock() from hardclock(9).  This will simplify
the logic in my WIP dynamic clock interrupt framework.

Currently, if stathz is zero, we call statclock() from within
hardclock(9).  It looks like this (see sys/kern/kern_clock.c):

void
hardclock(struct clockframe *frame)
{
/* [...] */

if (stathz == 0)
statclock(frame);

/* [...] */

This is the case on alpha, amd64 (w/ lapic), hppa, i386 (w/ lapic),
loongson, luna88k, mips64, and sh.

(We seem to do it on sgi, too.  I was under the impression that sgi
*was* a mips64 platform, yet sgi seems to have its own clock
interrupt code distinct from mips64's general clock interrupt code
which is used by e.g. octeon).

However, if stathz is not zero we call statclock() separately.  This
is the case on armv7, arm, arm64, macppc, powerpc64, and sparc64.

(The situation for the general powerpc code and socppc in particular
is a mystery to me.)

If we could remove this MD distinction it would make my MI framework
simpler.  Instead of checking stathz and conditionally starting a
statclock event I will be able to unconditionally start a statclock
event on all platforms on every CPU.

In general I don't think the "is stathz zero?" variance between
platforms is useful:

- The difference is invisible to userspace, as we hide the fact that
  stathz is zero from e.g. the kern.clockrate sysctl.

- We run statclock() at some point, regardless of whether stathz is
  zero.  If statclock() is run from hardclock(9) then isn't stathz
  effectively equal to hz?

- Because stathz might be zero we need to add a bunch of safety checks
  to our MI code to ensure we don't accidentally divide by zero.

Maybe we can ensure stathz is non-zero in a later diff...

--

Anyway, I don't think I have missed any platforms.  However, if
platform experts could weigh in here to verify my changes (and test
them!) I'd really appreciate it.

In particular, I'm confused about how clock interrupts work on
powerpc, socppc, and sgi.

--

Thoughts?  Platform-specific OKs?

Index: sys/kern/kern_clock.c
===
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.101
diff -u -p -r1.101 kern_clock.c
--- sys/kern/kern_clock.c   21 Jan 2020 16:16:23 -  1.101
+++ sys/kern/kern_clock.c   7 Jan 2021 16:37:09 -
@@ -164,12 +164,6 @@ hardclock(struct clockframe *frame)
}
}
 
-   /*
-* If no separate statistics clock is available, run it from here.
-*/
-   if (stathz == 0)
-   statclock(frame);
-
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
 
Index: sys/arch/alpha/alpha/clock.c
===
RCS file: /cvs/src/sys/arch/alpha/alpha/clock.c,v
retrieving revision 1.24
diff -u -p -r1.24 clock.c
--- sys/arch/alpha/alpha/clock.c6 Jul 2020 13:33:06 -   1.24
+++ sys/arch/alpha/alpha/clock.c7 Jan 2021 16:37:09 -
@@ -136,6 +136,13 @@ clockattach(dev, fns)
  * Machine-dependent clock routines.
  */
 
+void
+clockintr(struct clockframe *frame)
+{
+   hardclock(frame);
+   statclock(frame);
+}
+
 /*
  * Start the real-time and statistics clocks. Leave stathz 0 since there
  * are no other timers available.
@@ -165,7 +172,7 @@ cpu_initclocks(void)
 * hardclock, which would then fall over because the pointer
 * to the virtual timers wasn't set at that time.
 */
-   platform.clockintr = hardclock;
+   platform.clockintr = clockintr;
schedhz = 16;
 
evcount_attach(&clk_count, "clock", &clk_irq);
Index: sys/arch/amd64/amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.57
diff -u -p -r1.57 lapic.c
--- sys/arch/amd64/amd64/lapic.c6 Sep 2020 20:50:00 -   1.57
+++ sys/arch/amd64/amd64/lapic.c7 Jan 2021 16:37:09 -
@@ -452,6 +452,7 @@ lapic_clockintr(void *arg, struct intrfr
floor = ci->ci_handled_intr_level;
ci->ci_handled_intr_level = ci->ci_ilevel;
hardclock((struct clockframe *)&frame);
+   statclock((struct clockframe *)&frame);
ci->ci_handled_intr_level = floor;
 
clk_count.ec_count++;
Index: sys/arch/hppa/dev/clock.c
===
RCS file: /cvs/src/sys/arch/hppa/dev/clock.c,v
retrieving revision 1.31
diff -u -p -r1.31 clock.c
--- sys/arch/hppa/dev/clock.c   6 Jul 2020 13:33:07 -   1.31
+++ sys/arch/hppa/dev/clock.c   7 Jan 2021 16:37:09 -
@@ -43,7 +43,7 @@
 
 u_long cpu_hzticks;
 
-int	cpu_hardclock(void *);
+int	cpu_clockintr(void *);
 u_int  itmr_get_timecount(struct timecounter *);
 
 struct timecounter itmr_timecounter = {
@@ -106,7 +106,7 @@ cpu_initclocks(void)
 }
 
 int
-cpu_hardclock(void *v)
+cpu_clockintr(void *v)
 {

Re: tpm(4): don't use tvtohz(9)

2021-01-06 Thread Scott Cheloha
On Wed, Jan 06, 2021 at 12:16:13PM -0600, Scott Cheloha wrote:
> As mentioned in a prior mail, tpm(4) is the last user of tvtohz(9) in
> the tree.
> 
> However, we don't need to use tvtohz(9) in tpm(4) at all.  Converting
> from milliseconds to ticks is trivial.  Using an intermediary timeval
> is just pointless indirection.
> 
> With this committed I will be able to remove both tvtohz(9) and
> tstohz(9) from the tree in a subsequent patch.

Whoops, made a mistake in the prior diff.

We want to MAX() the result up to 1 to avoid dividing to zero and
blocking indefinitely in e.g. tsleep(9).  Previous diff MIN'd the
result down to 1, which is very wrong.

Index: tpm.c
===
RCS file: /cvs/src/sys/dev/acpi/tpm.c,v
retrieving revision 1.10
diff -u -p -r1.10 tpm.c
--- tpm.c   22 May 2020 10:16:37 -  1.10
+++ tpm.c   6 Jan 2021 22:09:49 -
@@ -24,6 +24,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -455,12 +456,7 @@ tpm_status(struct tpm_softc *sc)
 int
 tpm_tmotohz(int tmo)
 {
-   struct timeval tv;
-
-   tv.tv_sec = tmo / 1000;
-   tv.tv_usec = 1000 * (tmo % 1000);
-
-   return tvtohz(&tv);
+   return MAX(1, tmo * hz / 1000);
 }
 
 int



tpm(4): don't use tvtohz(9)

2021-01-06 Thread Scott Cheloha
As mentioned in a prior mail, tpm(4) is the last user of tvtohz(9) in
the tree.

However, we don't need to use tvtohz(9) in tpm(4) at all.  Converting
from milliseconds to ticks is trivial.  Using an intermediary timeval
is just pointless indirection.

With this committed I will be able to remove both tvtohz(9) and
tstohz(9) from the tree in a subsequent patch.

ok?

Index: tpm.c
===
RCS file: /cvs/src/sys/dev/acpi/tpm.c,v
retrieving revision 1.10
diff -u -p -r1.10 tpm.c
--- tpm.c   22 May 2020 10:16:37 -  1.10
+++ tpm.c   6 Jan 2021 18:06:47 -
@@ -24,6 +24,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -455,12 +456,7 @@ tpm_status(struct tpm_softc *sc)
 int
 tpm_tmotohz(int tmo)
 {
-   struct timeval tv;
-
-   tv.tv_sec = tmo / 1000;
-   tv.tv_usec = 1000 * (tmo % 1000);
-
-   return tvtohz(&tv);
+   return MIN(1, tmo * hz / 1000);
 }
 
 int



sleep(3): don't bypass nanosleep(2)

2021-01-06 Thread Scott Cheloha
In sleep(3), if seconds is zero we don't call nanosleep(2).

I don't like this.  If sleep(3) really is a simplified interface to
nanosleep(2) (as we claim in the manpage) we should let nanosleep(2)
handle the input and make decisions.

Other benefits:

- sleep(3) now *always* shows up in ktrace.

- sleep(3) with a zero input now blocks for up to 1 tick, just like
  nanosleep(2) does with a zero input (more intuitive behavior).

- Neither NetBSD nor FreeBSD bypass nanosleep(2) like this, so now our
  sleep(3) is more like theirs.

ok?
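
For context, after the diff below the function reads roughly like this
(the remainder handling at the end is paraphrased for illustration, not
part of the diff):

	unsigned int
	sleep(unsigned int seconds)
	{
		struct timespec rqt, rmt;

		rqt.tv_sec = seconds;
		rqt.tv_nsec = 0;

		/* Always enter the kernel, even when seconds == 0. */
		if (nanosleep(&rqt, &rmt) == -1) {
			/* Interrupted: report the whole seconds not slept. */
			return (rmt.tv_sec + (rmt.tv_nsec != 0));
		}
		return (0);
	}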

Index: sleep.c
===
RCS file: /cvs/src/lib/libc/gen/sleep.c,v
retrieving revision 1.12
diff -u -p -r1.12 sleep.c
--- sleep.c 14 Dec 2009 05:10:13 -  1.12
+++ sleep.c 6 Jan 2021 16:21:11 -
@@ -36,9 +36,6 @@ sleep(unsigned int seconds)
 {
struct timespec rqt, rmt;
 
-   if (seconds == 0)
-   return(0);
-
rqt.tv_sec = seconds;
rqt.tv_nsec = 0;
 



Re: bpf(4): remove ticks

2020-12-28 Thread Scott Cheloha
On Mon, Dec 28, 2020 at 10:49:59AM +1000, David Gwynne wrote:
> On Sat, Dec 26, 2020 at 04:48:23PM -0600, Scott Cheloha wrote:
> > Now that we've removed bd_rdStart from the bpf_d struct, removing
> > ticks from bpf(4) itself is straightforward.
> > 
> > - bd_rtout becomes a timespec; update bpfioctl() accordingly.
> >   Cap it at MAXTSLP nanoseconds to avoid arithmetic overflow
> >   in bpfread().
> > 
> > - At the start of bpfread(), if a timeout is set, find the end
> >   of the read as an absolute uptime.  This is the point where
> >   we want to avoid overflow: if bd_rtout is only MAXTSLP
> >   nanoseconds the timespecadd(3) will effectively never overflow.
> > 
> > - Before going to sleep, if we have a timeout set, compute how
> >   much longer to sleep in nanoseconds.
> > 
> >   Here's a spot where an absolute timeout sleep would save a
> >   little code, but we don't have such an interface yet.  Worth
> >   keeping in mind for the future, though.
> 
> Are there any other places that would be useful though? bpf is pretty
> special.

kqueue_scan() in kern_event.c can have a spurious wakeup, so an
absolute sleep would be useful there.  doppoll() in sys_generic.c is
the same, though I think mpi@/visa@ intend to refactor it to use
kqueue_scan().

In general, if you have a thread that wants to do something on a
strict period you need to use an absolute sleep to avoid drift.

This code drifts:

for (;;) {
do_work();
tsleep_nsec(&chan, PPAUSE, "worker", SEC_TO_NSEC(1));
}

While this code will not:

uint64_t deadline;

deadline = nsecuptime();
for (;;) {
do_work();
deadline = nsec_advance(deadline, SEC_TO_NSEC(1));
tsleep_abs_nsec(&chan, PPAUSE, "worker", deadline);
}

(Some of those interfaces don't actually exist, but they are easy to
write and you can infer how they work.)
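
For example, minimal sketches of the missing pieces (all hypothetical,
built only on nanouptime(9), TIMESPEC_TO_NSEC(), and tsleep_nsec(9)):

	uint64_t
	nsecuptime(void)
	{
		struct timespec now;

		nanouptime(&now);
		return TIMESPEC_TO_NSEC(&now);
	}

	uint64_t
	nsec_advance(uint64_t deadline, uint64_t period)
	{
		/* Could also skip ahead if we missed whole periods. */
		return deadline + period;
	}

	int
	tsleep_abs_nsec(const volatile void *ident, int prio,
	    const char *wmesg, uint64_t deadline)
	{
		uint64_t now = nsecuptime();

		if (deadline <= now)
			return (0);	/* deadline already passed */
		return tsleep_nsec(ident, prio, wmesg, deadline - now);
	}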

Most developers probably do not care about maintaining a strict period
for periodic workloads, but I have a suspicion that it would keep
system performance more deterministic because you don't have various
periodic workloads drifting into one another, overlapping, and
momentarily causing utilization spikes.

I know that probably sounds far-fetched... it's an idea I've been
fussing with.

> > dlg@: You said you wanted to simplify this loop.  Unsure what shape
> > that cleanup would take.  Should we clean it up before this change?
> 
> I wanted to have a single msleep_nsec call and pass INFSLP when it
> should sleep forever.. You saw my first attempt at that. It had
> issues.
> 
> > Thoughts?  ok?
> 
> How would this look if you used a uint64_t for nsecs for bd_rtout,
> and the nsec uptime thing from your pool diff instead of timespecs
> and nanouptime?

See the attached patch.  It is shorter because we can do more inline
stuff with a uint64_t than with a timespec.

> What's the thinking behind nanouptime instead of getnanouptime?

In general we should prefer high resolution time unless there is a
compelling performance reason to use low-res time.

In particular, we should prefer high res time whenever userspace
timeouts are involved as userspace can only use high-res time.

For instance, when tsleep_nsec/etc. are reimplemented with kclock
timeouts (soon!) I will remove the use of low-res time from
nanosleep(2), select(2)/pselect(2), poll(2)/ppoll(2), and kevent(2).
The use of low-res time in these interfaces can cause undersleep right
now.  They're buggy.
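
To illustrate the undersleep, here is a sketch of a caller measuring
its own sleep with low-res time.  Assume hz=100, so the low-res clock
advances in 10ms steps; all numbers are hypothetical:

	struct timespec start, end, elapsed;
	int error;

	getnanouptime(&start);	/* reads 100ms; true uptime is 109ms */
	error = tsleep_nsec(&nowake, PWAIT, "demo", MSEC_TO_NSEC(15));
	getnanouptime(&end);	/* reads 120ms at true uptime ~124ms */
	timespecsub(&end, &start, &elapsed);	/* "elapsed" is 20ms */

The thread slept about 15ms but deducts 20ms from its timeout budget,
so a loop like nanosleep(2)'s can return up to a tick early.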

> More generally, what will getnanouptime do when ticks go away?

Ticks will probably never "go away" entirely.

In general, some CPU is always going to need to call tc_windup()
regularly to keep time moving forward.  When tc_windup() is called we
update the low-res timestamps.  So getnanouptime(9)/etc. will continue
to work as they do today.  I also imagine that the CPU responsible for
calling tc_windup() will continue to increment `ticks' and `jiffies'.

In the near-ish future I want to add support for dynamic clock
interrupts.  This would permit a CPU to stay idle, or drop into a
deeper power-saving state, for longer than 1 tick if it has no work to
do.  This is not strictly-speaking "tickless" operation, as the clock
interrupt is not disabled.  But it is nice, so it is a near-term goal.

In the Distant Future we could add support for disabling the clock
interrupt on select CPUs.  This might be useful for certain realtime
applications.  This would be "true tickless" operation.  But for this
to be useful we'd need to add support for realtime scheduling policies
(e.g. SCHED_FIFO) and support for binding threads to (and excluding
threads from) particular CPUs.  So think Distant Future, if ever.

--

Anyway, updated patch here.

Index: bpf.c
=

bpf(4): remove ticks

2020-12-26 Thread Scott Cheloha
Now that we've removed bd_rdStart from the bpf_d struct, removing
ticks from bpf(4) itself is straightforward.

- bd_rtout becomes a timespec; update bpfioctl() accordingly.
  Cap it at MAXTSLP nanoseconds to avoid arithmetic overflow
  in bpfread().

- At the start of bpfread(), if a timeout is set, find the end
  of the read as an absolute uptime.  This is the point where
  we want to avoid overflow: if bd_rtout is only MAXTSLP
  nanoseconds the timespecadd(3) will effectively never overflow.

- Before going to sleep, if we have a timeout set, compute how
  much longer to sleep in nanoseconds.

  Here's a spot where an absolute timeout sleep would save a
  little code, but we don't have such an interface yet.  Worth
  keeping in mind for the future, though.

dlg@: You said you wanted to simplify this loop.  Unsure what shape
that cleanup would take.  Should we clean it up before this change?

Thoughts?  ok?

Index: bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.199
diff -u -p -r1.199 bpf.c
--- bpf.c   26 Dec 2020 16:30:58 -  1.199
+++ bpf.c   26 Dec 2020 22:05:04 -
@@ -60,6 +60,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -77,9 +78,6 @@
 
 #define PRINET  26 /* interruptible */
 
-/* from kern/kern_clock.c; incremented each clock tick. */
-extern int ticks;
-
 /*
  * The default read buffer size is patchable.
  */
@@ -380,7 +378,7 @@ bpfopen(dev_t dev, int flag, int mode, s
smr_init(&bd->bd_smr);
sigio_init(&bd->bd_sigio);
 
-   bd->bd_rtout = 0;   /* no timeout by default */
+   timespecclear(&bd->bd_rtout);   /* no timeout by default */
bd->bd_rnonblock = ISSET(flag, FNONBLOCK);
 
bpf_get(bd);
@@ -428,9 +426,11 @@ bpfclose(dev_t dev, int flag, int mode, 
 int
 bpfread(dev_t dev, struct uio *uio, int ioflag)
 {
+   struct timespec diff, end, now;
+   uint64_t nsecs;
struct bpf_d *d;
caddr_t hbuf;
-   int end, error, hlen, nticks;
+   int error, hlen, timeout;
 
KERNEL_ASSERT_LOCKED();
 
@@ -453,8 +453,11 @@ bpfread(dev_t dev, struct uio *uio, int 
/*
 * If there's a timeout, mark when the read should end.
 */
-   if (d->bd_rtout)
-   end = ticks + (int)d->bd_rtout;
+   timeout = timespecisset(&d->bd_rtout);
+   if (timeout) {
+   nanouptime(&now);
+   timespecadd(&now, &d->bd_rtout, &end);
+   }
 
/*
 * If the hold buffer is empty, then do a timed sleep, which
@@ -483,21 +486,26 @@ bpfread(dev_t dev, struct uio *uio, int 
if (d->bd_rnonblock) {
/* User requested non-blocking I/O */
error = EWOULDBLOCK;
-   } else if (d->bd_rtout == 0) {
+   } else if (timeout == 0) {
/* No read timeout set. */
d->bd_nreaders++;
error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
"bpf", INFSLP);
d->bd_nreaders--;
-   } else if ((nticks = end - ticks) > 0) {
-   /* Read timeout has not expired yet. */
-   d->bd_nreaders++;
-   error = msleep(d, >bd_mtx, PRINET|PCATCH, "bpf",
-   nticks);
-   d->bd_nreaders--;
} else {
-   /* Read timeout has expired. */
-   error = EWOULDBLOCK;
+   nanouptime(&now);
+   if (timespeccmp(&now, &end, <)) {
+   /* Read timeout has not expired yet. */
+   timespecsub(&end, &now, &diff);
+   nsecs = TIMESPEC_TO_NSEC(&diff);
+   d->bd_nreaders++;
+   error = msleep_nsec(d, &d->bd_mtx,
+   PRINET|PCATCH, "bpf", nsecs);
+   d->bd_nreaders--;
+   } else {
+   /* Read timeout has expired. */
+   error = EWOULDBLOCK;
+   }
}
if (error == EINTR || error == ERESTART)
goto out;
@@ -861,27 +869,17 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
case BIOCSRTIMEOUT:
{
struct timeval *tv = (struct timeval *)addr;
-   u_long rtout;
 
-   /* Compute number of ticks. */
if (tv->tv_sec < 0 || !timerisvalid(tv)) {
error = EINVAL;
break;
}
-   if (tv->tv_sec > INT_MAX / hz) {
-   error = EOVERFLOW;
-   break;
- 

kernel: more lbolt removal

2020-12-25 Thread Scott Cheloha
Here are some more sleeps that use lbolt; they could use &nowake and a
timeout of 1 second instead.

There are some trickier lbolt sleeps in the tty code that I have
omitted, plus the lbolt sleep in the vfs syncer thread.

Otherwise, lbolt is dead with this patch.

ok?

Index: uvm/uvm_fault.c
===
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.109
diff -u -p -r1.109 uvm_fault.c
--- uvm/uvm_fault.c 8 Dec 2020 12:26:31 -   1.109
+++ uvm/uvm_fault.c 25 Dec 2020 19:31:15 -
@@ -1121,7 +1121,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf
KASSERT(result != VM_PAGER_PEND);
 
if (result == VM_PAGER_AGAIN) {
-   tsleep_nsec(&lbolt, PVM, "fltagain2", INFSLP);
+   tsleep_nsec(&nowake, PVM, "fltagain2",
+   SEC_TO_NSEC(1));
return ERESTART;
}
 
Index: nfs/nfs_vfsops.c
===
RCS file: /cvs/src/sys/nfs/nfs_vfsops.c,v
retrieving revision 1.125
diff -u -p -r1.125 nfs_vfsops.c
--- nfs/nfs_vfsops.c10 Jan 2020 10:33:35 -  1.125
+++ nfs/nfs_vfsops.c25 Dec 2020 19:31:16 -
@@ -522,7 +522,8 @@ nfs_decode_args(struct nfsmount *nmp, st
if (nmp->nm_sotype == SOCK_DGRAM)
while (nfs_connect(nmp, NULL)) {
printf("nfs_args: retrying connect\n");
-   tsleep_nsec(, PSOCK, "nfscon", INFSLP);
+   tsleep_nsec(, PSOCK, "nfscon",
+   SEC_TO_NSEC(1));
}
}
 
Index: nfs/nfs_socket.c
===
RCS file: /cvs/src/sys/nfs/nfs_socket.c,v
retrieving revision 1.136
diff -u -p -r1.136 nfs_socket.c
--- nfs/nfs_socket.c21 Jan 2020 00:18:13 -  1.136
+++ nfs/nfs_socket.c25 Dec 2020 19:31:16 -
@@ -408,7 +408,7 @@ nfs_reconnect(struct nfsreq *rep)
while ((error = nfs_connect(nmp, rep)) != 0) {
if (error == EINTR || error == ERESTART)
return (EINTR);
-   tsleep_nsec(, PSOCK, "nfsrecon", INFSLP);
+   tsleep_nsec(, PSOCK, "nfsrecon", SEC_TO_NSEC(1));
}
 
/*



Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-25 Thread Scott Cheloha
On Wed, Dec 16, 2020 at 10:04:48PM +0100, Mark Kettenis wrote:
> > Date: Wed, 16 Dec 2020 12:50:46 -0600
> > From: Scott Cheloha 
> > 
> > On Tue, Dec 15, 2020 at 01:47:24PM +0100, Mark Kettenis wrote:
> > > > Date: Tue, 15 Dec 2020 13:32:22 +0100
> > > > From: Claudio Jeker 
> > > > 
> > > > On Fri, Dec 11, 2020 at 07:07:56PM -0600, Scott Cheloha wrote:
> > > > > Hi,
> > > > > 
> > > > > I'd like to remove lbolt from the kernel.  I think having it in the
> > > > > kernel complicates otherwise simple code.
> > > > > 
> > > > > We can start with sdmmc(4).
> > > > > 
> > > > > The goal in sdmmc_io_function_enable() is calling 
> > > > > sdmmc_io_function_ready()
> > > > > up to six times and sleep 1 second between each attempt.  Here's 
> > > > > rewritten
> > > > > code that does this without lbolt.
> > > > > 
> > > > > ok?
> > > > > 
> > > > > Index: sdmmc_io.c
> > > > > ===
> > > > > RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
> > > > > retrieving revision 1.41
> > > > > diff -u -p -r1.41 sdmmc_io.c
> > > > > --- sdmmc_io.c31 Dec 2019 10:05:33 -  1.41
> > > > > +++ sdmmc_io.c12 Dec 2020 01:04:59 -
> > > > > @@ -231,8 +231,8 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > > > >  {
> > > > >   struct sdmmc_softc *sc = sf->sc;
> > > > >   struct sdmmc_function *sf0 = sc->sc_fn0;
> > > > > + int chan, retry = 5;
> > > > >   u_int8_t rv;
> > > > > - int retry = 5;
> > > > >  
> > > > >   rw_assert_wrlock(>sc_lock);
> > > > >  
> > > > > @@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > > > >   sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
> > > > >  
> > > > >   while (!sdmmc_io_function_ready(sf) && retry-- > 0)
> > > > > - tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
> > > > > + tsleep_nsec(&chan, PPAUSE, "pause", SEC_TO_NSEC(1));
> > > > >   return (retry >= 0) ? 0 : ETIMEDOUT;
> > > > >  }
> > > > >  
> > > > 
> > Why not use an existing variable as wait channel instead of adding a new
> > variable chan?  Result is the same.  Would it make sense to allow NULL as
> > wait channel to make the tsleep not wakeable?  At least that could be used
> > in a few places where timeouts are implemented with tsleep and would make
> > the intent more obvious.
> > > 
> > > Or have an appropriately named global variable?  Something like "int 
> > > nowake"?
> > 
> > Something like the attached patch?
> > 
> > I think the idea of a "dead channel" communicates the intent.  Nobody
> > broadcasts wakeups on the dead channel.  If you don't want to receive
> > wakeup broadcasts you sleep on the dead channel.  Hence, "deadchan".
> 
> Yeah, that's a reasonable name.  Not sure if we need the indirection
> though.  Using &nowake isn't an enormous burden I'd say.

Now that we've settled on using &nowake, are we ok with the attached?

Index: sdmmc_io.c
===
RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
retrieving revision 1.41
diff -u -p -r1.41 sdmmc_io.c
--- sdmmc_io.c  31 Dec 2019 10:05:33 -  1.41
+++ sdmmc_io.c  25 Dec 2020 17:03:44 -
@@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
 
while (!sdmmc_io_function_ready(sf) && retry-- > 0)
-   tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
+   tsleep_nsec(&nowake, PPAUSE, "pause", SEC_TO_NSEC(1));
return (retry >= 0) ? 0 : ETIMEDOUT;
 }
 



Re: tsleep(9): add global "nowake" channel

2020-12-23 Thread Scott Cheloha
On Thu, Dec 24, 2020 at 12:24:01AM +0100, Patrick Wildt wrote:
> Am Wed, Dec 23, 2020 at 05:04:23PM -0600 schrieb Scott Cheloha:
> > On Wed, Dec 23, 2020 at 02:42:18PM -0700, Theo de Raadt wrote:
> > > I agree.  This chunk below is really gross and does not follow the
> > > special wakeup channel metaphor.
> > > 
> It is *entirely clear* that a variable called "nowake" has no wakeup.
> > > Like duh.
> > > 
> > > > +/*
> > > > + * nowake is a global sleep channel for threads that do not want
> > > > + * to receive wakeup(9) broadcasts.
> > > > + */
> > > > +int __nowake;
> > > > +void *nowake = &__nowake;
> > 
> > So we'll go with this?
> > 
> > Index: kern/kern_synch.c
> > ===
> > RCS file: /cvs/src/sys/kern/kern_synch.c,v
> > retrieving revision 1.172
> > diff -u -p -r1.172 kern_synch.c
> > --- kern/kern_synch.c   7 Dec 2020 16:55:29 -   1.172
> > +++ kern/kern_synch.c   23 Dec 2020 23:03:31 -
> > @@ -87,6 +87,11 @@ sleep_queue_init(void)
> > TAILQ_INIT([i]);
> >  }
> >  
> > +/*
> > + * Global sleep channel for threads that do not want to
> > + * receive wakeup(9) broadcasts.
> > + */
> > +int nowake;
> >  
> >  /*
> >   * During autoconfiguration or after a panic, a sleep will simply
> > @@ -119,6 +124,7 @@ tsleep(const volatile void *ident, int p
> >  #endif
> >  
> > KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
> > +   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
> 
> Sure you compiled this? ident is void *, nowake is int.  Should be ident
> != &nowake?  Same for the other code in the diff.

Whoops.  Thought I had compiled it.

Yes, I meant &nowake.

Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.172
diff -u -p -r1.172 kern_synch.c
--- kern/kern_synch.c   7 Dec 2020 16:55:29 -   1.172
+++ kern/kern_synch.c   23 Dec 2020 23:12:10 -
@@ -87,6 +87,11 @@ sleep_queue_init(void)
TAILQ_INIT([i]);
 }
 
+/*
+ * Global sleep channel for threads that do not want to
+ * receive wakeup(9) broadcasts.
+ */
+int nowake;
 
 /*
  * During autoconfiguration or after a panic, a sleep will simply
@@ -119,6 +124,7 @@ tsleep(const volatile void *ident, int p
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
+   KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
 
 #ifdef MULTIPROCESSOR
KASSERT(timo || _kernel_lock_held());
@@ -213,6 +219,7 @@ msleep(const volatile void *ident, struc
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
KASSERT(mtx != NULL);
 
if (priority & PCATCH)
@@ -301,6 +308,7 @@ rwsleep(const volatile void *ident, stru
int error, status;
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
rw_assert_anylock(rwl);
status = rw_status(rwl);
 
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.148
diff -u -p -r1.148 systm.h
--- sys/systm.h 26 Aug 2020 03:29:07 -  1.148
+++ sys/systm.h 23 Dec 2020 23:12:10 -
@@ -107,6 +107,8 @@ extern struct vnode *rootvp;/* vnode eq
 extern dev_t swapdev;  /* swapping device */
 extern struct vnode *swapdev_vp;/* vnode equivalent to above */
 
+extern int nowake; /* dead wakeup(9) channel */
+
 struct proc;
 struct process;
 #define curproc curcpu()->ci_curproc



Re: tsleep(9): add global "nowake" channel

2020-12-23 Thread Scott Cheloha
On Wed, Dec 23, 2020 at 02:42:18PM -0700, Theo de Raadt wrote:
> I agree.  This chunk below is really gross and does not follow the
> special wakeup channel metaphor.
> 
> It is *entirely clear* that a variable called "nowake" has no wakeup.
> Like duh.
> 
> > +/*
> > + * nowake is a global sleep channel for threads that do not want
> > + * to receive wakeup(9) broadcasts.
> > + */
> > +int __nowake;
> > +void *nowake = &__nowake;

So we'll go with this?

Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.172
diff -u -p -r1.172 kern_synch.c
--- kern/kern_synch.c   7 Dec 2020 16:55:29 -   1.172
+++ kern/kern_synch.c   23 Dec 2020 23:03:31 -
@@ -87,6 +87,11 @@ sleep_queue_init(void)
TAILQ_INIT([i]);
 }
 
+/*
+ * Global sleep channel for threads that do not want to
+ * receive wakeup(9) broadcasts.
+ */
+int nowake;
 
 /*
  * During autoconfiguration or after a panic, a sleep will simply
@@ -119,6 +124,7 @@ tsleep(const volatile void *ident, int p
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
+   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
 
 #ifdef MULTIPROCESSOR
KASSERT(timo || _kernel_lock_held());
@@ -213,6 +219,7 @@ msleep(const volatile void *ident, struc
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
KASSERT(mtx != NULL);
 
if (priority & PCATCH)
@@ -301,6 +308,7 @@ rwsleep(const volatile void *ident, stru
int error, status;
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
rw_assert_anylock(rwl);
status = rw_status(rwl);
 
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.148
diff -u -p -r1.148 systm.h
--- sys/systm.h 26 Aug 2020 03:29:07 -  1.148
+++ sys/systm.h 23 Dec 2020 23:03:31 -
@@ -107,6 +107,8 @@ extern struct vnode *rootvp;/* vnode eq
 extern dev_t swapdev;  /* swapping device */
 extern struct vnode *swapdev_vp;/* vnode equivalent to above */
 
+extern int nowake; /* dead wakeup(9) channel */
+
 struct proc;
 struct process;
 #define curproc curcpu()->ci_curproc



Re: i386: apm(4): apm_thread(): sleep without lbolt

2020-12-23 Thread Scott Cheloha
On Tue, Dec 15, 2020 at 09:15:31AM -0300, Martin Pieuchot wrote:
> On 11/12/20(Fri) 19:17, Scott Cheloha wrote:
> > Here's another sleep that doesn't need lbolt.
> > 
> > The idea here is to call apm_periodic_check() once a second.
> > We can do that without lbolt.
> > 
> > Is there some other address that would be more appropriate for this
> > thread to sleep on?  It doesn't look like any apm(4) code calls
> > wakeup(9) on lbolt so I've just replaced it with a local channel.
> 
> Note sure we want to grow the stack just for that.  Any member of `sc',
> or even `sc' itself if this doesn't conflict, could be used as wait
> channel.

Assuming we go ahead with the global nowake channel, is this ok?

Index: apm.c
===
RCS file: /cvs/src/sys/arch/i386/i386/apm.c,v
retrieving revision 1.125
diff -u -p -r1.125 apm.c
--- apm.c   24 Jun 2020 22:03:40 -  1.125
+++ apm.c   23 Dec 2020 21:03:50 -
@@ -909,7 +909,7 @@ apm_thread(void *v)
rw_enter_write(>sc_lock);
(void) apm_periodic_check(sc);
rw_exit_write(>sc_lock);
-   tsleep_nsec(&lbolt, PWAIT, "apmev", INFSLP);
+   tsleep_nsec(nowake, PWAIT, "apmev", SEC_TO_NSEC(1));
}
 }
 



tsleep(9): add global "nowake" channel

2020-12-23 Thread Scott Cheloha
Okay, let's try one more time.

This patch adds a global sleep channel, "nowake", for sleeping threads
that don't want to receive wakeup(9) broadcasts.

You use it like this:

#include <sys/systm.h>

tsleep(nowake, ...);

I've added additional assertions to tsleep, msleep, and rwsleep that
ensure there is *some* way to wake up the thread.  You need either an
ident that is not nowake, PCATCH, or a timeout.
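
For example, under the new assertions (wmesg strings arbitrary):

	/* Fine: the timeout will wake us. */
	tsleep_nsec(nowake, PWAIT, "apmev", SEC_TO_NSEC(1));

	/* Fine: a signal can wake us. */
	tsleep_nsec(nowake, PWAIT | PCATCH, "pause", INFSLP);

	/* Trips the KASSERT: nothing can ever wake this thread. */
	tsleep_nsec(nowake, PWAIT, "stuck", INFSLP);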

I prefer using indirection here to save a character when using it as
the ident, i.e. "nowake" is shorter than "&nowake".

I'll document it when it is used more broadly in the kernel.

ok?

Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.172
diff -u -p -r1.172 kern_synch.c
--- kern/kern_synch.c   7 Dec 2020 16:55:29 -   1.172
+++ kern/kern_synch.c   23 Dec 2020 20:56:35 -
@@ -87,6 +87,12 @@ sleep_queue_init(void)
TAILQ_INIT([i]);
 }
 
+/*
+ * nowake is a global sleep channel for threads that do not want
+ * to receive wakeup(9) broadcasts.
+ */
+int __nowake;
+void *nowake = &__nowake;
 
 /*
  * During autoconfiguration or after a panic, a sleep will simply
@@ -119,6 +125,7 @@ tsleep(const volatile void *ident, int p
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
+   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
 
 #ifdef MULTIPROCESSOR
KASSERT(timo || _kernel_lock_held());
@@ -213,6 +220,7 @@ msleep(const volatile void *ident, struc
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
KASSERT(mtx != NULL);
 
if (priority & PCATCH)
@@ -301,6 +309,7 @@ rwsleep(const volatile void *ident, stru
int error, status;
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != nowake || ISSET(priority, PCATCH) || timo != 0);
rw_assert_anylock(rwl);
status = rw_status(rwl);
 
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.148
diff -u -p -r1.148 systm.h
--- sys/systm.h 26 Aug 2020 03:29:07 -  1.148
+++ sys/systm.h 23 Dec 2020 20:56:35 -
@@ -107,6 +107,8 @@ extern struct vnode *rootvp;/* vnode eq
 extern dev_t swapdev;  /* swapping device */
 extern struct vnode *swapdev_vp;/* vnode equivalent to above */
 
+extern void *nowake;   /* dead wakeup(9) channel */
+
 struct proc;
 struct process;
 #define curproc curcpu()->ci_curproc



Re: pool(9): remove ticks (attempt 2)

2020-12-23 Thread Scott Cheloha
On Fri, Dec 11, 2020 at 05:32:54PM -0600, Scott Cheloha wrote:
> On Fri, Dec 11, 2020 at 07:52:45PM +0100, Mark Kettenis wrote:
> > > Date: Fri, 11 Dec 2020 11:51:54 -0600
> > > From: Scott Cheloha 
> > > 
> > > On Fri, Dec 11, 2020 at 09:49:07AM -0300, Martin Pieuchot wrote:
> > > > 
> > > > I'm not sure to understand, can't we do:
> > > > 
> > > > pool_wait_free = SEC_TO_NSEC(1);
> > > > pool_wait_gc = SEC_TO_NSEC(8);
> > > > 
> > > [...]
> > > 
> > > We can do that at runtime but not at compile time.  SEC_TO_NSEC(1)
> > > isn't a constant so that won't compile (I just tried).
> > > 
> > > We _could_ do something like this:
> > > 
> > > #define POOL_WAIT_FREESEC_TO_NSEC(1)
> > > 
> > > I think the compiler will probably inline the result and elide the
> > > overflow check because the input is a constant.  I don't know how to
> > > verify this, but my limited understanding of compilers suggests that
> > > this is totally possible.
> > 
> > Yes.  The consequence of that is that the values are no longer
> > patchable.  That may not be very important though (I never really use
> > that possibility).
> 
> What do you mean by "patchable"?  I assume you don't mean the source
> code.
> 
> (Also, you did not comment on the struct stuff below so I'm proceeding
> with the impression there's nothing at issue there.)

Hearing nothing after two weeks I assume nobody cares if timeouts are
no longer patchable.
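
For anyone wondering why SEC_TO_NSEC(1) can't initialize a global at
compile time: it is an inline function with an overflow check, roughly

	static inline uint64_t
	SEC_TO_NSEC(uint64_t seconds)
	{
		if (seconds > UINT64_MAX / 1000000000ULL)
			return UINT64_MAX;
		return seconds * 1000000000ULL;
	}

so it isn't a constant expression.  Wrapping a constant argument in a
macro, as below, lets the compiler fold the whole thing away.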

Looking for OKs on the attached patch.

CC tedu@/dlg@, who added these timeouts to pool(9) in the first place:

https://github.com/openbsd/src/commit/786f6c84e747ccb9777ad35d2b01160679aec089

Index: sys/pool.h
===
RCS file: /cvs/src/sys/sys/pool.h,v
retrieving revision 1.77
diff -u -p -r1.77 pool.h
--- sys/pool.h  19 Jul 2019 09:03:03 -  1.77
+++ sys/pool.h  23 Dec 2020 17:06:19 -
@@ -201,9 +201,9 @@ struct pool {
u_int   pr_cache_items; /* target list length */
u_int   pr_cache_contention;
u_int   pr_cache_contention_prev;
-   int pr_cache_tick;  /* time idle list was empty */
-   int pr_cache_nout;
+   uint64_t        pr_cache_timestamp; /* time idle list was empty */
uint64_tpr_cache_ngc;   /* # of times the gc released a list */
+   int pr_cache_nout;
 
u_int   pr_align;
u_int   pr_maxcolors;   /* Cache coloring */
Index: kern/subr_pool.c
===
RCS file: /cvs/src/sys/kern/subr_pool.c,v
retrieving revision 1.230
diff -u -p -r1.230 subr_pool.c
--- kern/subr_pool.c24 Jan 2020 06:31:17 -  1.230
+++ kern/subr_pool.c23 Dec 2020 17:06:20 -
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -148,7 +149,7 @@ struct pool_page_header {
caddr_t ph_page;/* this page's address */
caddr_t ph_colored; /* page's colored address */
unsigned long   ph_magic;
-   int ph_tick;
+   uint64_t        ph_timestamp;
 };
 #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
 #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
@@ -266,8 +267,22 @@ void   pool_gc_sched(void *);
 struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
 void   pool_gc_pages(void *);
 struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
-int pool_wait_free = 1;
-int pool_wait_gc = 8;
+
+#define POOL_WAIT_FREE SEC_TO_NSEC(1)
+#define POOL_WAIT_GC   SEC_TO_NSEC(8)
+
+/*
+ * TODO Move getnsecuptime() to kern_tc.c and document it when we
+ * have callers in other modules.
+ */
+static uint64_t
+getnsecuptime(void)
+{
+   struct timespec now;
+
+   getnanouptime(&now);
+   return TIMESPEC_TO_NSEC(&now);
+}
 
 RBT_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare);
 
@@ -797,7 +812,7 @@ pool_put(struct pool *pp, void *v)
/* is it time to free a page? */
if (pp->pr_nidle > pp->pr_maxpages &&
(ph = TAILQ_FIRST(>pr_emptypages)) != NULL &&
-   (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
+   getnsecuptime() - ph->ph_timestamp > POOL_WAIT_FREE) {
freeph = ph;
pool_p_remove(pp, freeph);
}
@@ -864,7 +879,7 @@ pool_do_put(struct pool *pp, void *v)
 */
pp->pr_nidle++;
 
-   ph->ph_tick = ticks;
+   ph->ph_timestamp = getnsecuptime();
   

prototype of delay(9) is inconsistent

2020-12-21 Thread Scott Cheloha
The manpage for delay(9) suggests that the prototype is:

void delay(int);

But on armv7, arm64, hppa, macppc, and powerpc64 the input is unsigned
or a u_int instead of an int.  Like this:

void delay(unsigned);

or this:

void delay(u_int);

Can we pick a prototype and stick to it?

An upside of an unsigned input is a larger usable input range.  A
negative input to delay(9) makes no sense anyway.  On the other hand,
we could KASSERT a non-negative value and catch bugs if the input were
signed.  Also, the input should, in general, be small.  We probably
don't really need the extra range.  A large input might itself
indicate a bug.

In any case, I want the prototypes to match up across platforms.

What do people prefer?  Signed int or unsigned int?
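
For the record, the signed-with-assert option would look something
like this (sketch; the machine-dependent busy-wait is elided):

	void
	delay(int usecs)
	{
		KASSERT(usecs >= 0);

		/* MD busy-wait for usecs microseconds... */
	}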



[please test] acpi: more *sleep(9) -> *sleep_nsec(9) conversions

2020-12-20 Thread Scott Cheloha
Short version:

Please test if this patch breaks suspend or hibernate on your
machine.  Reply with the results of your test and your dmesg.

Both are still working on my Lenovo Thinkpad X1 Carbon Gen 6.

Long version:

This patch converts the remaining tick sleeps in dev/acpi to
nanosecond sleeps.

In acpiec_wait() we have a tsleep(9) for up to 1 tick.  There is no
discernable unit in this code and no timeout so as with all the other
1 tick sleeps I think tsleep_nsec(9) for 1ms will work fine.  It may
oversleep 1ms but because there is no timeout it doesn't really
matter.

In acpi_event_wait() we have a timeout in milliseconds.  Currently we
sleep for up to 1 tick and deduct time from the timeout when the sleep
returns EWOULDBLOCK.  Of note here is that this code is broken on
kernels where hz > 1000: we will never time out.

I have changed the code to rwsleep_nsec(9) for at least 1ms.  The
timeout is in milliseconds so this seems logical to me.

The caveat is that on default HZ=100 kernels we will oversleep and it
will take much longer for us to time out in this loop.  We can fix
this by measuring the sleep and deducting the elapsed time from the
total timeout.  Or, if it doesn't really matter, we can just oversleep
and not worry about it.
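
If we do deduct, a sketch of the measurement, reusing the
getnsecuptime() helper from the pool diff (untested):

	uint64_t start, elapsed;

	start = getnsecuptime();
	if (rwsleep_nsec(evt, &acpi_softc->sc_lck, PWAIT, "acpievt",
	    MSEC_TO_NSEC(1)) == EWOULDBLOCK) {
		elapsed = getnsecuptime() - start;
		if (timeout < AML_NO_TIMEOUT)
			timeout -= MIN((uint64_t)timeout, elapsed / 1000000);
	}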

Preferences?

Assuming we get no bad tests back, OK?

Index: acpiec.c
===
RCS file: /cvs/src/sys/dev/acpi/acpiec.c,v
retrieving revision 1.62
diff -u -p -r1.62 acpiec.c
--- acpiec.c26 Aug 2020 03:29:06 -  1.62
+++ acpiec.c20 Dec 2020 22:19:14 -
@@ -105,8 +105,10 @@ acpiec_wait(struct acpiec_softc *sc, uin
sc->sc_gotsci = 1;
if (cold || (stat & EC_STAT_BURST))
delay(1);
-   else
-   tsleep(, PWAIT, "acpiec", 1);
+   else {
+   tsleep_nsec(, PWAIT, "acpiec",
+   MSEC_TO_NSEC(1));
+   }
}
 
dnprintf(40, "%s: EC wait_ns, stat: %b\n", DEVNAME(sc), (int)stat,
Index: dsdt.c
===
RCS file: /cvs/src/sys/dev/acpi/dsdt.c,v
retrieving revision 1.257
diff -u -p -r1.257 dsdt.c
--- dsdt.c  17 Dec 2020 17:57:19 -  1.257
+++ dsdt.c  20 Dec 2020 22:19:15 -
@@ -2907,10 +2907,10 @@ acpi_event_wait(struct aml_scope *scope,
if (acpi_dotask(acpi_softc))
continue;
if (!cold) {
-   if (rwsleep(evt, &acpi_softc->sc_lck, PWAIT,
-   "acpievt", 1) == EWOULDBLOCK) {
+   if (rwsleep_nsec(evt, &acpi_softc->sc_lck, PWAIT,
+   "acpievt", MSEC_TO_NSEC(1)) == EWOULDBLOCK) {
if (timeout < AML_NO_TIMEOUT)
-   timeout -= (1000 / hz);
+   timeout--;
}
} else {
delay(1000);



Re: sigsuspend(2): use "sigsuspend" for sleep string

2020-12-20 Thread Scott Cheloha
On Sun, Dec 20, 2020 at 10:11:07PM +0100, Mark Kettenis wrote:
> > Date: Sun, 20 Dec 2020 14:53:16 -0600
> > From: Scott Cheloha 
> > 
> > I want to see if a process is waiting in sigsuspend(2) from top(1).
> > The current sleep string is "pause", which leaves me wondering what
> > the process is actually doing.  The string "sigsuspend" would make it
> > unambiguous.
> > 
> > ok?
> 
> No this is too long as it will get truncated.  KI_WMESGLEN is 8, so
> "sigsusp" would work.

Whoops, sure thing.

While we're at it, can I truncate "nanosleep" (9) to "nanoslp" (7)?

Index: kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.268
diff -u -p -r1.268 kern_sig.c
--- kern_sig.c  7 Dec 2020 16:55:28 -   1.268
+++ kern_sig.c  20 Dec 2020 21:27:52 -
@@ -523,7 +523,7 @@ sys_sigsuspend(struct proc *p, void *v, 
struct sigacts *ps = pr->ps_sigacts;
 
dosigsuspend(p, SCARG(uap, mask) &~ sigcantmask);
-   while (tsleep_nsec(ps, PPAUSE|PCATCH, "pause", INFSLP) == 0)
+   while (tsleep_nsec(ps, PPAUSE|PCATCH, "sigsusp", INFSLP) == 0)
/* void */;
/* always return EINTR rather than ERESTART... */
return (EINTR);
Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.150
diff -u -p -r1.150 kern_time.c
--- kern_time.c 10 Nov 2020 17:26:54 -  1.150
+++ kern_time.c 20 Dec 2020 21:27:52 -
@@ -294,7 +294,7 @@ sys_nanosleep(struct proc *p, void *v, r
do {
getnanouptime();
nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(), MAXTSLP));
-   error = tsleep_nsec(, PWAIT | PCATCH, "nanosleep", nsecs);
+   error = tsleep_nsec(, PWAIT | PCATCH, "nanoslp", nsecs);
getnanouptime();
timespecsub(, , );
timespecsub(, , );



sigsuspend(2): use "sigsuspend" for sleep string

2020-12-20 Thread Scott Cheloha
I want to see if a process is waiting in sigsuspend(2) from top(1).
The current sleep string is "pause", which leaves me wondering what
the process is actually doing.  The string "sigsuspend" would make it
unambiguous.

ok?

Index: kern_sig.c
===
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.268
diff -u -p -r1.268 kern_sig.c
--- kern_sig.c  7 Dec 2020 16:55:28 -   1.268
+++ kern_sig.c  20 Dec 2020 20:49:44 -
@@ -523,7 +523,7 @@ sys_sigsuspend(struct proc *p, void *v, 
struct sigacts *ps = pr->ps_sigacts;
 
dosigsuspend(p, SCARG(uap, mask) &~ sigcantmask);
-   while (tsleep_nsec(ps, PPAUSE|PCATCH, "pause", INFSLP) == 0)
+   while (tsleep_nsec(ps, PPAUSE|PCATCH, "sigsuspend", INFSLP) == 0)
/* void */;
/* always return EINTR rather than ERESTART... */
return (EINTR);



Re: tpm(4): removing tvtohz(9)?

2020-12-19 Thread Scott Cheloha
> On Dec 18, 2020, at 20:16, joshua stein  wrote:
> 
> On Fri, 18 Dec 2020 at 18:58:43 -0600, Scott Cheloha wrote:
>> Hi,
>> 
>> tpm(4) is the last driver in the tree using tvtohz(9).  There are no
>> remaining callers using tstohz(9), so if and when we remove tvtohz(9)
>> from tpm(4) we can remove both interfaces from the tree.
>> 
>> tpm(4) is tricky because it converts timeouts from milliseconds to
>> ticks and then doesn't use tsleep(9) at all.  It uses delay(9), which
>> takes a count of microseconds as argument.  This complicates the
>> conversion to tsleep_nsec(9) because the units don't match up for any
>> of these delays.  Also, delay(9) is incompatible with tsleep(9)
>> because tsleep(9) yields the CPU while delay(9) busy-waits.
>> 
>> I don't know if we *need* to delay(9) here.  What would happen if we
>> yielded the CPU with e.g. tsleep(9)?
>> 
>> The attached patch changes the delays to use the correct units.  This
>> is not the right thing, these timeouts are probably too large to spin
>> for in delay(9).  I'm just guessing here.
>> 
>> Aside: TPM_READ_TMO is *huge*.  2 minutes for a read timeout seems a
>> bit large.  NetBSD's TPM_READ_TMO has been dropped to 2 seconds, like
>> the other timeouts.
> 
> Yes, this driver sucks.  Its only purpose is to make certain devices 
> suspend and resume properly and doesn't provide any actual TPM 
> functionality.
> 
> We (someone other than me) should just take NetBSD's rewrite which 
> also adds TPM 2 support and apparently works on MSFT0101 devices 
> which was committed and backed out in ours because it didn't work.

Hmmm.  I guess I'll merge their latest code
and see if I can get it working.



tpm(4): removing tvtohz(9)?

2020-12-18 Thread Scott Cheloha
Hi,

tpm(4) is the last driver in the tree using tvtohz(9).  There are no
remaining callers using tstohz(9), so if and when we remove tvtohz(9)
from tpm(4) we can remove both interfaces from the tree.

tpm(4) is tricky because it converts timeouts from milliseconds to
ticks and then doesn't use tsleep(9) at all.  It uses delay(9), which
takes a count of microseconds as argument.  This complicates the
conversion to tsleep_nsec(9) because the units don't match up for any
of these delays.  Also, delay(9) is incompatible with tsleep(9)
because tsleep(9) yields the CPU while delay(9) busy-waits.

I don't know if we *need* to delay(9) here.  What would happen if we
yielded the CPU with e.g. tsleep(9)?
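
For illustration, a yielding tpm_waitfor() might look something like
this.  Sketch only, untested; it assumes we are always in process
context here and the wmesg is arbitrary:

	int
	tpm_waitfor(struct tpm_softc *sc, uint8_t mask, int msecs)
	{
		uint8_t status;

		while (((status = tpm_status(sc)) & mask) != mask) {
			if (msecs <= 0)
				return status;
			tsleep_nsec(sc, PWAIT, "tpmwait", MSEC_TO_NSEC(1));
			msecs--;
		}
		return 0;
	}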

The attached patch changes the delays to use the correct units.  This
is not the right thing, these timeouts are probably too large to spin
for in delay(9).  I'm just guessing here.

Aside: TPM_READ_TMO is *huge*.  2 minutes for a read timeout seems a
bit large.  NetBSD's TPM_READ_TMO has been dropped to 2 seconds, like
the other timeouts.

Also perhaps of note is that NetBSD's tpm(4) driver mostly no longer
uses delay(9).  They use tsleep(9) in all but one spot:

https://github.com/NetBSD/src/blob/fc83762bc464be0bf351901b2c387a8cfedff7c4/sys/dev/ic/tpm.c

Index: tpm.c
===
RCS file: /cvs/src/sys/dev/acpi/tpm.c,v
retrieving revision 1.10
diff -u -p -r1.10 tpm.c
--- tpm.c   22 May 2020 10:16:37 -  1.10
+++ tpm.c   19 Dec 2020 00:56:02 -
@@ -158,7 +158,6 @@ int tpm_request_locality(struct tpm_soft
 void   tpm_release_locality(struct tpm_softc *);
int    tpm_getburst(struct tpm_softc *);
uint8_t    tpm_status(struct tpm_softc *);
-int    tpm_tmotohz(int);
 
 struct cfattach tpm_ca = {
sizeof(struct tpm_softc),
@@ -372,7 +371,7 @@ int
 tpm_request_locality(struct tpm_softc *sc, int l)
 {
uint32_t r;
-   int to;
+   int msecs;
 
if (l != 0)
return EINVAL;
@@ -385,12 +384,12 @@ tpm_request_locality(struct tpm_softc *s
bus_space_write_1(sc->sc_bt, sc->sc_bh, TPM_ACCESS,
TPM_ACCESS_REQUEST_USE);
 
-   to = tpm_tmotohz(TPM_ACCESS_TMO);
-
-   while ((r = bus_space_read_1(sc->sc_bt, sc->sc_bh, TPM_ACCESS) &
-   (TPM_ACCESS_VALID | TPM_ACCESS_ACTIVE_LOCALITY)) !=
-   (TPM_ACCESS_VALID | TPM_ACCESS_ACTIVE_LOCALITY) && to--) {
-   DELAY(10);
+   for (msecs = 0; msecs < TPM_ACCESS_TMO; msecs++) {
+   r = bus_space_read_1(sc->sc_bt, sc->sc_bh, TPM_ACCESS);
+   if ((r & (TPM_ACCESS_VALID | TPM_ACCESS_ACTIVE_LOCALITY)) ==
+   (TPM_ACCESS_VALID | TPM_ACCESS_ACTIVE_LOCALITY))
+   break;
+   DELAY(1000);
}
 
if ((r & (TPM_ACCESS_VALID | TPM_ACCESS_ACTIVE_LOCALITY)) !=
@@ -418,12 +417,10 @@ tpm_release_locality(struct tpm_softc *s
 int
 tpm_getburst(struct tpm_softc *sc)
 {
-   int burst, burst2, to;
-
-   to = tpm_tmotohz(TPM_BURST_TMO);
+   int burst, burst2, msecs;
 
burst = 0;
-   while (burst == 0 && to--) {
+   for (msecs = 0; msecs < TPM_BURST_TMO; msecs++) {
/*
 * Burst count has to be read from bits 8 to 23 without
 * touching any other bits, eg. the actual status bits 0 to 7.
@@ -438,7 +435,7 @@ tpm_getburst(struct tpm_softc *sc)
if (burst)
return burst;
 
-   DELAY(10);
+   DELAY(1000);
}
 
DPRINTF(("%s: getburst timed out\n", sc->sc_dev.dv_xname));
@@ -453,30 +450,19 @@ tpm_status(struct tpm_softc *sc)
 }
 
 int
-tpm_tmotohz(int tmo)
-{
-   struct timeval tv;
-
-   tv.tv_sec = tmo / 1000;
-   tv.tv_usec = 1000 * (tmo % 1000);
-
-   return tvtohz(&tv);
-}
-
-int
-tpm_waitfor(struct tpm_softc *sc, uint8_t mask, int tries)
+tpm_waitfor(struct tpm_softc *sc, uint8_t mask, int msecs)
 {
uint8_t status;
 
while (((status = tpm_status(sc)) & mask) != mask) {
-   if (tries == 0) {
+   if (msecs <= 0) {
DPRINTF(("%s: %s: timed out, status 0x%x != 0x%x\n",
sc->sc_dev.dv_xname, __func__, status, mask));
return status;
}
 
-   tries--;
-   DELAY(1);
+   msecs--;
+   DELAY(1000);
}
 
return 0;



tsleep(9): sleep on private channel if ident is NULL

2020-12-18 Thread Scott Cheloha
Hi,

This patch adds support for passing NULL as the ident when calling
tsleep(9) etc.  When this happens, sleep_setup() will use the address
of the sleep_state struct as the value for p_wchan.  This address is
basically always a private value so the thread should never receive a
wakeup(9) broadcast.

Why do we want this?  Sometimes there is no logical ident to sleep on.
Sometimes there is legitimately no reason to receive a wakeup(9).

In the past, people have handrolled private channels to work around
this situation.  The code often looks like this:

void
foo(void)
{
int chan;

tsleep(&chan, ...);
}

Permitting the use of NULL and letting the implementation choose a
private channel is better than handrolling a private channel for two
reasons:

1. We save a bit of stack space.  tsleep(9) etc. already have a
   sleep_state struct on the stack and it's at a private address
   so there is no space cost to use it.

2. The NULL clearly communicates the author's intent to the reader.
   It indicates the author had no wakeup channel in mind when they
   wrote the code.  The reader then doesn't need to reason about
   whether or not the ident value is superfluous.  Poring over
   a file (or several) to determine whether any thread ever calls
   wakeup(9) on a given ident sucks.

FreeBSD/NetBSD have a dedicated interface for this "sleep without a
wakeup channel" operation.  They call it "pause".  I proposed adding
it but I got mixed feedback on the patch.  Then mpi@ proposed this
idea.  I think this is simpler and better.

The actual implementation requires just a few small changes to
sleep_setup().

I've added an additional KASSERT to each of tsleep(9), msleep(9), and
rwsleep(9).  You now need at least one of (a) an ident or (b) PCATCH
or (c) a timeout, otherwise there is no way to get the thread started
again.  This would indicate a programmer error and we should panic if
it ever happens.

I've documented the new NULL ident behavior in tsleep.9.

Also included here is a sample user, sys_nanosleep().  nanosleep(2)
wakes up due to interruption by signal or timeout.  It should never be
awoken with wakeup(9).  Up until now we had a private channel on the
stack.  Now we can just pass NULL.  It's simpler.

There are a bunch of other potential users but they can wait until a
later patch.

I'm running with this now so I'm pretty sure this is a sound change.
Feel free to test it out.  nanosleep(2) gets called all the time so if
there was an issue I imagine it'd show up pretty quickly.

Thoughts?  ok?

Index: share/man/man9/tsleep.9
===
RCS file: /cvs/src/share/man/man9/tsleep.9,v
retrieving revision 1.15
diff -u -p -r1.15 tsleep.9
--- share/man/man9/tsleep.9 20 Mar 2020 03:37:09 -  1.15
+++ share/man/man9/tsleep.9 18 Dec 2020 19:40:04 -
@@ -144,8 +144,11 @@ to the resource for which the process is
 The same identifier must be used in a call to
 .Fn wakeup
 to get the process going again.
+If the thread does not want to receive any
+.Fn wakeup
+broadcasts,
 .Fa ident
-should not be
+should be
 .Dv NULL .
 .It Fa priority
 The process priority to be used when the process is awakened and put on
Index: sys/kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.172
diff -u -p -r1.172 kern_synch.c
--- sys/kern/kern_synch.c   7 Dec 2020 16:55:29 -   1.172
+++ sys/kern/kern_synch.c   18 Dec 2020 19:40:04 -
@@ -119,6 +119,7 @@ tsleep(const volatile void *ident, int p
 #endif
 
KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
+   KASSERT(ident != NULL || ISSET(priority, PCATCH) || timo != 0);
 
 #ifdef MULTIPROCESSOR
KASSERT(timo || _kernel_lock_held());
@@ -214,6 +215,7 @@ msleep(const volatile void *ident, struc
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
KASSERT(mtx != NULL);
+   KASSERT(ident != NULL || ISSET(priority, PCATCH) || timo != 0);
 
if (priority & PCATCH)
KERNEL_ASSERT_LOCKED();
@@ -301,6 +303,7 @@ rwsleep(const volatile void *ident, stru
int error, status;
 
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
+   KASSERT(ident != NULL || ISSET(priority, PCATCH) || timo != 0);
rw_assert_anylock(rwl);
status = rw_status(rwl);
 
@@ -351,8 +354,6 @@ sleep_setup(struct sleep_state *sls, con
 #ifdef DIAGNOSTIC
if (p->p_flag & P_CANTSLEEP)
panic("sleep: %s failed insomnia", p->p_p->ps_comm);
-   if (ident == NULL)
-   panic("tsleep: no ident");
if (p->p_stat != SONPROC)
panic("tsleep: not SONPROC");
 #endif
@@ -378,11 +379,23 @@ sleep_setup(struct sleep_state *sls, con
 
TRACEPOINT(sched, sleep, NULL);
 
-   p->p_wchan = ident;
+   /*
+* If ident is NULL the caller does not want to receive
+* 
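
In sketch, the sleep_setup() change described above amounts to this
(not the diff verbatim):

	/*
	 * If ident is NULL the caller does not want to receive
	 * wakeup(9) broadcasts, so substitute a private channel:
	 * the address of our on-stack sleep_state is never shared.
	 */
	if (ident == NULL)
		ident = sls;
	p->p_wchan = ident;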

Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-16 Thread Scott Cheloha
> On Dec 16, 2020, at 18:40, Martin Pieuchot  wrote:
> 
> On 16/12/20(Wed) 23:23, Claudio Jeker wrote:
>>> On Wed, Dec 16, 2020 at 04:50:42PM -0300, Martin Pieuchot wrote:
>>> [...] 
>>> Why did we choose to use a variable over NULL?  Any technical reason?
>> 
>> The sleep subsystem requires a non-NULL value for ident. Changing this
>> seems not trivial.
> 
> I'd say this is an implementation detail, nothing prevents us from using a
> "private" ident value if NULL is passed to tsleep(9) :)
> 
>>> I'm wondering it the locality of the variable might not matter in a
>>> distant future.  Did you dig a bit deeper about the FreeBSD solution?
>>> Why did they choose a per-CPU value?
>> 
>> Currently all sleep channels are hashed into IIRC 128 buckets. If all
>> timeouts use the same sleep channel then this queue may get overcrowded.
>> I guess only instrumentation and measurements will tell us how bad the
>> sleep queue is hashed.
> 
> So using a global as sleep channel is not optimum?  Would it be better
> to use an address on the stack?  If so we could make sleep_setup() accept
> NULL and use 'sls' for example.

Yes, I think that scheme would dodge any issues with overuse of a
single global channel.  Plus, "tsleep_nsec(NULL, ...)" looks about
right.  Passing NULL as the wakeup channel to say "I don't want any
wakeup(9) broadcasts" is intuitive.  More intuitive than handrolling a
private channel on the stack or (as clever as I think it is) "deadchan".
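
For reference, the hashing Claudio mentions is roughly this
(kern_synch.c):

	#define TABLESIZE	128
	#define LOOKUP(x)	(((long)(x) >> 8) & (TABLESIZE - 1))

Every thread sleeping on the same ident lands on the same of the 128
queues, while distinct stack addresses spread out across them.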



Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-16 Thread Scott Cheloha
On Tue, Dec 15, 2020 at 01:47:24PM +0100, Mark Kettenis wrote:
> > Date: Tue, 15 Dec 2020 13:32:22 +0100
> > From: Claudio Jeker 
> > 
> > On Fri, Dec 11, 2020 at 07:07:56PM -0600, Scott Cheloha wrote:
> > > Hi,
> > > 
> > > I'd like to remove lbolt from the kernel.  I think having it in the
> > > kernel complicates otherwise simple code.
> > > 
> > > We can start with sdmmc(4).
> > > 
> > > The goal in sdmmc_io_function_enable() is calling 
> > > sdmmc_io_function_ready()
> > > up to six times and sleep 1 second between each attempt.  Here's rewritten
> > > code that does this without lbolt.
> > > 
> > > ok?
> > > 
> > > Index: sdmmc_io.c
> > > ===
> > > RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
> > > retrieving revision 1.41
> > > diff -u -p -r1.41 sdmmc_io.c
> > > --- sdmmc_io.c31 Dec 2019 10:05:33 -  1.41
> > > +++ sdmmc_io.c12 Dec 2020 01:04:59 -
> > > @@ -231,8 +231,8 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > >  {
> > >   struct sdmmc_softc *sc = sf->sc;
> > >   struct sdmmc_function *sf0 = sc->sc_fn0;
> > > + int chan, retry = 5;
> > >   u_int8_t rv;
> > > - int retry = 5;
> > >  
> > >   rw_assert_wrlock(>sc_lock);
> > >  
> > > @@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > >   sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
> > >  
> > >   while (!sdmmc_io_function_ready(sf) && retry-- > 0)
> > > - tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
> > > + tsleep_nsec(&chan, PPAUSE, "pause", SEC_TO_NSEC(1));
> > >   return (retry >= 0) ? 0 : ETIMEDOUT;
> > >  }
> > >  
> > 
> > Why not use an existing variable as wait channel instead of adding a new
> > variable chan? Result is the same. Would it make sense to allow NULL as
> > wait channel to make the tsleep not wakeable? At least that could be used
> > in a few places where timeouts are implemented with tsleep and would make
> > the intent more obvious.
> 
> Or have an appropriately named global variable?  Something like "int nowake"?

Something like the attached patch?

I think the idea of a "dead channel" communicates the intent.  Nobody
broadcasts wakeups on the dead channel.  If you don't want to receive
wakeup broadcasts you sleep on the dead channel.  Hence, "deadchan".

Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.172
diff -u -p -r1.172 kern_synch.c
--- kern/kern_synch.c   7 Dec 2020 16:55:29 -   1.172
+++ kern/kern_synch.c   16 Dec 2020 18:50:12 -
@@ -87,6 +87,12 @@ sleep_queue_init(void)
TAILQ_INIT([i]);
 }
 
+/*
+ * Threads that do not want to receive wakeup(9) broadcasts should
+ * sleep on deadchan.
+ */
+static int __deadchan;
+int *deadchan = &__deadchan;
 
 /*
  * During autoconfiguration or after a panic, a sleep will simply
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.148
diff -u -p -r1.148 systm.h
--- sys/systm.h 26 Aug 2020 03:29:07 -  1.148
+++ sys/systm.h 16 Dec 2020 18:50:12 -
@@ -107,6 +107,8 @@ extern struct vnode *rootvp;/* vnode eq
 extern dev_t swapdev;  /* swapping device */
 extern struct vnode *swapdev_vp;/* vnode equivalent to above */
 
+extern int *deadchan;  /* dead wakeup(9) channel */
+
 struct proc;
 struct process;
 #define curproc curcpu()->ci_curproc
Index: dev/sdmmc/sdmmc_io.c
===
RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
retrieving revision 1.41
diff -u -p -r1.41 sdmmc_io.c
--- dev/sdmmc/sdmmc_io.c31 Dec 2019 10:05:33 -  1.41
+++ dev/sdmmc/sdmmc_io.c16 Dec 2020 18:50:12 -
@@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
 
while (!sdmmc_io_function_ready(sf) && retry-- > 0)
-   tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
+   tsleep_nsec(deadchan, PPAUSE, "pause", SEC_TO_NSEC(1));
return (retry >= 0) ? 0 : ETIMEDOUT;
 }
 



Re: tht(4): more tsleep(9) -> tsleep_nsec(9) conversions

2020-12-16 Thread Scott Cheloha
On Thu, Dec 03, 2020 at 09:59:11PM -0600, Scott Cheloha wrote:
> Hi,
> 
> tht(4) is another driver still using tsleep(9).
> 
> It uses it to spin while it waits for the card to load the firmware.
> Then it uses it to spin for up to 2 seconds while waiting for
> THT_REG_INIT_STATUS.
> 
> In the firmware case we can sleep for 10 milliseconds each iteration.
> 
> In the THT_REG_INIT_STATUS loop we can sleep for 10 milliseconds each
> iteration again, but instead of using a timeout to set a flag after 2
> seconds we can just count how many milliseconds we've slept.  This is
> less precise than using the timeout but it is much simpler.  Obviously
> we then need to remove all the timeout-related stuff from the function
> and the file.
> 
> Thoughts?  ok?

Two week bump.

Index: if_tht.c
===
RCS file: /cvs/src/sys/dev/pci/if_tht.c,v
retrieving revision 1.142
diff -u -p -r1.142 if_tht.c
--- if_tht.c10 Jul 2020 13:26:38 -  1.142
+++ if_tht.c4 Dec 2020 03:57:21 -
@@ -582,7 +582,6 @@ void   tht_lladdr_read(struct tht_softc
 void   tht_lladdr_write(struct tht_softc *);
 int    tht_sw_reset(struct tht_softc *);
 int    tht_fw_load(struct tht_softc *);
-void   tht_fw_tick(void *arg);
 void   tht_link_state(struct tht_softc *);
 
 /* interface operations */
@@ -1667,11 +1666,9 @@ tht_sw_reset(struct tht_softc *sc)
 int
 tht_fw_load(struct tht_softc *sc)
 {
-   struct timeout  ticker;
-   volatile intok = 1;
u_int8_t*fw, *buf;
size_t  fwlen, wrlen;
-   int error = 1;
+   int error = 1, msecs, ret;
 
if (loadfirmware("tht", , ) != 0)
return (1);
@@ -1682,7 +1679,9 @@ tht_fw_load(struct tht_softc *sc)
buf = fw;
while (fwlen > 0) {
while (tht_fifo_writable(sc, >sc_txt) <= THT_FIFO_GAP) {
-   if (tsleep(sc, PCATCH, "thtfw", 1) == EINTR)
+   ret = tsleep_nsec(sc, PCATCH, "thtfw",
+   MSEC_TO_NSEC(10));
+   if (ret == EINTR)
goto err;
}
 
@@ -1695,32 +1694,21 @@ tht_fw_load(struct tht_softc *sc)
buf += wrlen;
}
 
-   timeout_set(&ticker, tht_fw_tick, (void *)&ok);
-   timeout_add_sec(&ticker, 2);
-   while (ok) {
+   for (msecs = 0; msecs < 2000; msecs += 10) {
if (tht_read(sc, THT_REG_INIT_STATUS) != 0) {
error = 0;
break;
}
-
-   if (tsleep(sc, PCATCH, "thtinit", 1) == EINTR)
+   ret = tsleep_nsec(sc, PCATCH, "thtinit", MSEC_TO_NSEC(10));
+   if (ret == EINTR)
goto err;
}
-   timeout_del(&ticker);
 
tht_write(sc, THT_REG_INIT_SEMAPHORE, 0x1);
 
 err:
free(fw, M_DEVBUF, fwlen);
return (error);
-}
-
-void
-tht_fw_tick(void *arg)
-{
-   volatile int*ok = arg;
-
-   *ok = 0;
 }
 
 void



Re: sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-15 Thread Scott Cheloha
On Tue, Dec 15, 2020 at 01:47:24PM +0100, Mark Kettenis wrote:
> > Date: Tue, 15 Dec 2020 13:32:22 +0100
> > From: Claudio Jeker 
> > 
> > On Fri, Dec 11, 2020 at 07:07:56PM -0600, Scott Cheloha wrote:
> > > Hi,
> > > 
> > > I'd like to remove lbolt from the kernel.  I think having it in the
> > > kernel complicates otherwise simple code.
> > > 
> > > We can start with sdmmc(4).
> > > 
> > > The goal in sdmmc_io_function_enable() is calling 
> > > sdmmc_io_function_ready()
> > > up to six times and sleep 1 second between each attempt.  Here's rewritten
> > > code that does this without lbolt.
> > > 
> > > ok?
> > > 
> > > Index: sdmmc_io.c
> > > ===
> > > RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
> > > retrieving revision 1.41
> > > diff -u -p -r1.41 sdmmc_io.c
> > > --- sdmmc_io.c31 Dec 2019 10:05:33 -  1.41
> > > +++ sdmmc_io.c12 Dec 2020 01:04:59 -
> > > @@ -231,8 +231,8 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > >  {
> > >   struct sdmmc_softc *sc = sf->sc;
> > >   struct sdmmc_function *sf0 = sc->sc_fn0;
> > > + int chan, retry = 5;
> > >   u_int8_t rv;
> > > - int retry = 5;
> > >  
> > >   rw_assert_wrlock(>sc_lock);
> > >  
> > > @@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
> > >   sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
> > >  
> > >   while (!sdmmc_io_function_ready(sf) && retry-- > 0)
> > > - tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
> > > + tsleep_nsec(&chan, PPAUSE, "pause", SEC_TO_NSEC(1));
> > >   return (retry >= 0) ? 0 : ETIMEDOUT;
> > >  }
> > >  
> > 
> > Why not use an existing variable as wait channel instead of adding a new
> > variable chan? Result is the same. Would it make sense to allow NULL as
> > wait channel to make the tsleep not wakeable? At least that could be used
> > in a few places where timeouts are implemented with tsleep and would make
> > the intent more obvious.
> 
> Or have an appropriately named global variable?  Something like "int nowake"?

I like this.  Brief aside into other BSDs:

--

FreeBSD and NetBSD call this operation a "pause" instead of a "sleep".
The idea is that a sleeping thread can be woken up with e.g.
wakeup(9) but that a paused thread cannot be awoken in this way.
Paused threads can still be interrupted with a signal.

NetBSD has kpause(9):

https://man.netbsd.org/kpause.9

FreeBSD has a whole bunch of pause interfaces:

https://www.freebsd.org/cgi/man.cgi?query=pause&sektion=9&manpath=FreeBSD+12.2-RELEASE+and+Ports

It kind-of sounds like what we want.  From that page:

> The pause() function is a wrapper around tsleep() that suspends
> execution of the current thread for the indicated timeout.  The
> thread can not be awakened early by signals or calls to wakeup(),
> wakeup_one() or wakeup_any().  The pause_sig() function is a variant
> of pause() which can be awakened early by signals.

FreeBSD implements it with a special per-CPU pause channel.
Look at FreeBSD's _sleep():

https://github.com/freebsd/freebsd/blob/d551da60d42039156f003de6644e9e147ed167a3/sys/kern/kern_synch.c#L173

--

So with that in mind, my thought is to start with a global "int pause"
channel that we all collectively agree not to pass to wakeup(9).  We
can advance the concept more if need be.

I'm happy to fuss with the name.

int pause_chan?



syncer_thread: sleep without lbolt

2020-12-12 Thread Scott Cheloha
Hi,

The syncer thread is one of the last users of the lbolt (lightning
bolt!) sleep channel.

If we add a syncer-specific sleep channel (syncer_chan) and do a bit
of time math we can replicate the current behavior and remove another
lbolt user.

This isn't a perfect recreation of the current behavior.  In this
version the sleep period will drift if processing takes longer than 1
second.  I think it's good enough.  If people are concerned about a
perfect recreation of the current behavior we *can* do it, but it will
require more code.  I don't think it's worth it.
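
For completeness, the drift-free variant would carry an absolute
deadline across iterations, something like this sketch:

	uint64_t deadline, now;

	deadline = getnsecuptime();
	for (;;) {
		/* ... push dirty vnodes ... */
		deadline += SEC_TO_NSEC(1);
		now = getnsecuptime();
		if (now < deadline)
			tsleep_nsec(&syncer_chan, PPAUSE, "syncer",
			    deadline - now);
		else
			deadline = now;	/* fell behind; resynchronize */
	}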

This also fixes two problems in the current code.  They aren't huge
bugs, but they did jump out as potential problems because they make
the syncer's behavior less deterministic:

- The current code uses gettime(9), which will jump and screw up your
  measurement if someone calls settimeofday(2).  The new code uses the
  uptime clock, which is monotonic and stable.

- The current code uses gettime(9), which has a resolution of 1
  second.  Measuring a 1 second timeout with an interface with
  a resolution of 1 second is crude and error-prone.  The new code
  uses getnsecuptime(), which has a resolution of roughly 1/hz.
  Much better.

I vaguely recall beck@ trying to do something with this in the recent
past, so CC beck@.

Thoughts?  ok?

Index: vfs_sync.c
===
RCS file: /cvs/src/sys/kern/vfs_sync.c,v
retrieving revision 1.64
diff -u -p -r1.64 vfs_sync.c
--- vfs_sync.c  24 Jun 2020 22:03:41 -  1.64
+++ vfs_sync.c  12 Dec 2020 19:29:11 -
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -73,6 +74,7 @@ LIST_HEAD(synclist, vnode);
 static struct synclist *syncer_workitem_pending;
 
 struct proc *syncerproc;
+int syncer_chan;
 
 /*
  * The workitem queue.
@@ -129,6 +131,15 @@ vn_syncer_add_to_worklist(struct vnode *
splx(s);
 }
 
+uint64_t
+getnsecuptime(void)
+{
+   struct timespec now;
+
+   getnanouptime(&now);
+   return TIMESPEC_TO_NSEC(&now);
+}
+
 /*
  * System filesystem synchronizer daemon.
  */
@@ -138,11 +149,11 @@ syncer_thread(void *arg)
struct proc *p = curproc;
struct synclist *slp;
struct vnode *vp;
-   time_t starttime;
+   uint64_t elapsed, start;
int s;
 
for (;;) {
-   starttime = gettime();
+   start = getnsecuptime();
 
/*
 * Push files whose dirty time has expired.
@@ -220,6 +231,7 @@ syncer_thread(void *arg)
rushjob -= 1;
continue;
}
+
/*
 * If it has taken us less than a second to process the
 * current work, then wait. Otherwise start right over
@@ -228,8 +240,11 @@ syncer_thread(void *arg)
 * matter as we are just trying to generally pace the
 * filesystem activity.
 */
-   if (gettime() == starttime)
-   tsleep_nsec(&lbolt, PPAUSE, "syncer", INFSLP);
+   elapsed = getnsecuptime() - start;
+   if (elapsed < SEC_TO_NSEC(1)) {
+   tsleep_nsec(&syncer_chan, PPAUSE, "syncer",
+   SEC_TO_NSEC(1) - elapsed);
+   }
}
 }
 
@@ -242,7 +257,7 @@ int
 speedup_syncer(void)
 {
if (syncerproc)
-   wakeup_proc(syncerproc, &lbolt);
+   wakeup_proc(syncerproc, &syncer_chan);
if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;



i386: apm(4): apm_thread(): sleep without lbolt

2020-12-11 Thread Scott Cheloha
Here's another sleep that doesn't need lbolt.

The idea here is to call apm_periodic_check() once a second.
We can do that without lbolt.

Is there some other address that would be more appropriate for this
thread to sleep on?  It doesn't look like any apm(4) code calls
wakeup(9) on lbolt so I've just replaced it with a local channel.

ok?

Index: apm.c
===
RCS file: /cvs/src/sys/arch/i386/i386/apm.c,v
retrieving revision 1.125
diff -u -p -r1.125 apm.c
--- apm.c   24 Jun 2020 22:03:40 -  1.125
+++ apm.c   12 Dec 2020 01:17:38 -
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -904,12 +905,13 @@ void
 apm_thread(void *v)
 {
struct apm_softc *sc = v;
+   int chan;
 
for (;;) {
rw_enter_write(>sc_lock);
(void) apm_periodic_check(sc);
rw_exit_write(>sc_lock);
-   tsleep_nsec(&lbolt, PWAIT, "apmev", INFSLP);
+   tsleep_nsec(&chan, PWAIT, "apmev", SEC_TO_NSEC(1));
}
 }
 



sdmmc(4): sdmmc_io_function_enable(): don't sleep on lbolt

2020-12-11 Thread Scott Cheloha
Hi,

I'd like to remove lbolt from the kernel.  I think having it in the
kernel complicates otherwise simple code.

We can start with sdmmc(4).

The goal in sdmmc_io_function_enable() is to call sdmmc_io_function_ready()
up to six times, sleeping 1 second between attempts.  Here's rewritten
code that does this without lbolt.

ok?

Index: sdmmc_io.c
===
RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_io.c,v
retrieving revision 1.41
diff -u -p -r1.41 sdmmc_io.c
--- sdmmc_io.c  31 Dec 2019 10:05:33 -  1.41
+++ sdmmc_io.c  12 Dec 2020 01:04:59 -
@@ -231,8 +231,8 @@ sdmmc_io_function_enable(struct sdmmc_fu
 {
struct sdmmc_softc *sc = sf->sc;
struct sdmmc_function *sf0 = sc->sc_fn0;
+   int chan, retry = 5;
u_int8_t rv;
-   int retry = 5;
 
rw_assert_wrlock(&sc->sc_lock);
 
@@ -244,7 +244,7 @@ sdmmc_io_function_enable(struct sdmmc_fu
sdmmc_io_write_1(sf0, SD_IO_CCCR_FN_ENABLE, rv);
 
while (!sdmmc_io_function_ready(sf) && retry-- > 0)
-   tsleep_nsec(&lbolt, PPAUSE, "pause", INFSLP);
+   tsleep_nsec(&chan, PPAUSE, "pause", SEC_TO_NSEC(1));
return (retry >= 0) ? 0 : ETIMEDOUT;
 }
 



Re: pool(9): remove ticks (attempt 2)

2020-12-11 Thread Scott Cheloha
On Fri, Dec 11, 2020 at 07:52:45PM +0100, Mark Kettenis wrote:
> > Date: Fri, 11 Dec 2020 11:51:54 -0600
> > From: Scott Cheloha 
> > 
> > On Fri, Dec 11, 2020 at 09:49:07AM -0300, Martin Pieuchot wrote:
> > > 
> > > I'm not sure to understand, can't we do:
> > > 
> > >   pool_wait_free = SEC_TO_NSEC(1);
> > >   pool_wait_gc = SEC_TO_NSEC(8);
> > > 
> > [...]
> > 
> > We can do that at runtime but not at compile time.  SEC_TO_NSEC(1)
> > isn't a constant so that won't compile (I just tried).
> > 
> > We _could_ do something like this:
> > 
> > #define POOL_WAIT_FREE  SEC_TO_NSEC(1)
> > 
> > I think the compiler will probably inline the result and elide the
> > overflow check because the input is a constant.  I don't know how to
> > verify this, but my limited understanding of compilers suggests that
> > this is totally possible.
> 
> Yes.  The consequence of that is that the values are no longer
> patchable.  That may not be very important though (I never really use
> that possibility).

What do you mean by "patchable"?  I assume you don't mean the source
code.

(Also, you did not comment on the struct stuff below so I'm proceeding
with the impression there's nothing at issue there.)

> > > One comment below
> > > 
> > > [...]
> > >
> > > > > Index: sys/pool.h
> > > > > ===
> > > > > RCS file: /cvs/src/sys/sys/pool.h,v
> > > > > retrieving revision 1.77
> > > > > diff -u -p -r1.77 pool.h
> > > > > --- sys/pool.h19 Jul 2019 09:03:03 -  1.77
> > > > > +++ sys/pool.h10 Dec 2020 22:08:33 -
> > > > > @@ -201,7 +201,7 @@ struct pool {
> > > > >   u_int   pr_cache_items; /* target list length */
> > > > >   u_int   pr_cache_contention;
> > > > >   u_int   pr_cache_contention_prev;
> > > > > - int pr_cache_tick;  /* time idle list was empty */
> > > > > + uint64_tpr_cache_timestamp; /* when idle list was 
> > > > > empty */
> > > > >   int pr_cache_nout;
> > > 
> > > Do you see a change in size of the struct?  If so, does moving
> > > `pr_cache_nout' after `pr_cache_ngc' help?
> > 
> > Right, the structs.  We're changing both struct pool_page_header and
> > struct pool.
> > 
> > For the unmodified (-current) binary on amd64:
> > 
> > $ gdb -q obj/bsd.gdb
> > (gdb) p sizeof(struct pool_page_header)
> > $1 = 112
> > (gdb) p sizeof(struct pool)
> > $2 = 424
> > 
> > For the patched binary on amd64:
> > 
> > (gdb) p sizeof(struct pool_page_header) 
> > $1 = 112
> > (gdb) p sizeof(struct pool)
> > $2 = 432
> > 
> > kettenis@: The pool_page_header does not grow.  That's good, right?
> > 
> > If we do as mpi@ suggested and move pr_cache_nout after
> > pr_cache_ngc...
> > 
> > $ gdb -q obj/bsd.gdb 
> > (gdb) p sizeof(struct pool_page_header)
> > $1 = 112
> > (gdb) p sizeof(struct pool)
> > $2 = 424
> > 
> > Bingo!  Same size.  mpi@, how did you know?
> > 
> > Patch attached with the struct shuffling and the new macros below.
> > 
> > Index: kern/subr_pool.c
> > ===
> > RCS file: /cvs/src/sys/kern/subr_pool.c,v
> > retrieving revision 1.230
> > diff -u -p -r1.230 subr_pool.c
> > --- kern/subr_pool.c24 Jan 2020 06:31:17 -  1.230
> > +++ kern/subr_pool.c11 Dec 2020 17:50:46 -
> > @@ -41,6 +41,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  
> > @@ -148,7 +149,7 @@ struct pool_page_header {
> > caddr_t ph_page;/* this page's address */
> > caddr_t ph_colored; /* page's colored address */
> > unsigned long   ph_magic;
> > -   int ph_tick;
> > +   uint64_tph_timestamp;
> >  };
> >  #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
> >  #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
> > @@ -266,8 +267,19 @@ void   pool_gc_sched(void *);
> >  struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
> >  void   pool_gc_pa

Re: pool(9): remove ticks (attempt 2)

2020-12-11 Thread Scott Cheloha
On Fri, Dec 11, 2020 at 09:49:07AM -0300, Martin Pieuchot wrote:
> On 11/12/20(Fri) 12:52, Mark Kettenis wrote:
> > > Date: Thu, 10 Dec 2020 16:13:22 -0600
> > > From: Scott Cheloha 
> > > 
> > > Hi,
> > > 
> > > We looked at removing the ticks from subr_pool.c a while back but it
> > > got shelved.  That may or may not have been my fault.  I don't
> > > remember.
> > > 
> > > Anyway, I would normally suggest switching to getuptime(9) here, but
> > > getuptime(9) counts in seconds and we're working with a 1 second
> > > timeout in this code (pool_wait_free) so that's too coarse an
> > > interface for this job.
> > > 
> > > The next best thing I could come up with was introducing a coarse
> > > sub-second interface for use in this file, "getnsecuptime()", which
> > > calls getnanouptime(9) and converts the result to a 64-bit count of
> > > nanoseconds.  This is relatively fast (we don't read the underlying
> > > timecounter hardware) and causes a minimal amount of code change (we
> > > can use it inline because it returns an integral value).
> > > 
> > > From there the changes are simple:
> > > 
> > > - Renames: ph_tick -> ph_timestamp, pr_cache_tick -> pr_cache_timestamp
> > > 
> > > - Call getnsecuptime(9) wherever we read 'ticks'.
> > > 
> > > - Change pool_wait_gc and pool_wait_free to counts of nanoseconds.
> > >   They could be macros, e.g.
> > > 
> > > #define POOL_WAIT_GC  8000000000ULL
> > > 
> > >   but I'll leave that for a second diff to keep things simple.
> > > 
> > > This compiles and I haven't changed any logic so I assume it isn't
> > > broken.
> > > 
> > > We could move getnsecuptime() into kern_tc.c but it isn't used
> > > anywhere else yet so I'm hesitant to do so.
> > > 
> > > Thoughts?
> > 
> > Specifying the timeouts in nanoseconds isn't particularly useful I'd
> > say.  But I see we can't use SEC_TO_NSEC here because of the overflow
> > check...
> 
> I'm not sure to understand, can't we do:
> 
>   pool_wait_free = SEC_TO_NSEC(1);
>   pool_wait_gc = SEC_TO_NSEC(8);
> 
> or are you pointing at something else?

We can do that at runtime but not at compile time.  SEC_TO_NSEC(1)
isn't a constant so that won't compile (I just tried).

We _could_ do something like this:

#define POOL_WAIT_FREE  SEC_TO_NSEC(1)

I think the compiler will probably inline the result and elide the
overflow check because the input is a constant.  I don't know how to
verify this, but my limited understanding of compilers suggests that
this is totally possible.
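
For reference, SEC_TO_NSEC(9) in sys/time.h is an inline function with
an overflow check, roughly like this (paraphrased, not a verbatim
copy):

	static inline uint64_t
	SEC_TO_NSEC(uint64_t seconds)
	{
		if (seconds > UINT64_MAX / 1000000000ULL)
			return UINT64_MAX;
		return seconds * 1000000000ULL;
	}

Because it's a function and not a constant expression it can't appear
in a static initializer, hence the compile-time vs. runtime distinction
above.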

> One comment below
> 
> [...]
>
> > > Index: sys/pool.h
> > > ===
> > > RCS file: /cvs/src/sys/sys/pool.h,v
> > > retrieving revision 1.77
> > > diff -u -p -r1.77 pool.h
> > > --- sys/pool.h19 Jul 2019 09:03:03 -  1.77
> > > +++ sys/pool.h10 Dec 2020 22:08:33 -
> > > @@ -201,7 +201,7 @@ struct pool {
> > >   u_int   pr_cache_items; /* target list length */
> > >   u_int   pr_cache_contention;
> > >   u_int   pr_cache_contention_prev;
> > > - int pr_cache_tick;  /* time idle list was empty */
> > > + uint64_tpr_cache_timestamp; /* when idle list was empty */
> > >   int pr_cache_nout;
> 
> Do you see a change in size of the struct?  If so, does moving
> `pr_cache_nout' after `pr_cache_ngc' help?

Right, the structs.  We're changing both struct pool_page_header and
struct pool.

For the unmodified (-current) binary on amd64:

$ gdb -q obj/bsd.gdb
(gdb) p sizeof(struct pool_page_header)
$1 = 112
(gdb) p sizeof(struct pool)
$2 = 424

For the patched binary on amd64:

(gdb) p sizeof(struct pool_page_header) 
$1 = 112
(gdb) p sizeof(struct pool)
$2 = 432

kettenis@: The pool_page_header does not grow.  That's good, right?

If we do as mpi@ suggested and move pr_cache_nout after
pr_cache_ngc...

$ gdb -q obj/bsd.gdb 
(gdb) p sizeof(struct pool_page_header)
$1 = 112
(gdb) p sizeof(struct pool)
$2 = 424

Bingo!  Same size.  mpi@, how did you know?

Patch attached with the struct shuffling and the new macros below.

Index: kern/subr_pool.c
===
RCS file: /cvs/src/sys/kern/subr_pool.c,v
retrieving revision 1.230
diff -u -p -r1.230 subr_pool.c
--- kern/subr_pool.c24 Jan 2020 06:31:17 -  1.230
+++ kern/subr_pool.c11 Dec 2020 17:50:46 -
@@ -41,6 +41,7 @@
 

cat(1): -n flag: support files with more than INT_MAX lines

2020-12-10 Thread Scott Cheloha
Hi,

If we bump 'line' from an int to an unsigned long long we will
correctly number files with more than INT_MAX lines instead of
wrapping to a negative number.
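
To illustrate the failure mode (a sketch, not part of the diff; signed
overflow is formally undefined, but in practice it wraps on our
platforms):

	int line = INT_MAX;
	fprintf(stdout, "%6d\t", ++line);	/* prints -2147483648 */

An unsigned long long counter is good for 2^64 - 1 lines.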

ok?

Index: cat.c
===
RCS file: /cvs/src/bin/cat/cat.c,v
retrieving revision 1.30
diff -u -p -r1.30 cat.c
--- cat.c   4 Dec 2020 02:25:56 -   1.30
+++ cat.c   11 Dec 2020 04:10:51 -
@@ -132,7 +132,8 @@ cook_args(char **argv)
 void
 cook_buf(FILE *fp, const char *filename)
 {
-   int ch, gobble, line, prev;
+   unsigned long long line;
+   int ch, gobble, prev;
 
line = gobble = 0;
for (prev = '\n'; (ch = getc(fp)) != EOF; prev = ch) {
@@ -147,7 +148,7 @@ cook_buf(FILE *fp, const char *filename)
}
if (nflag) {
if (!bflag || ch != '\n') {
-   (void)fprintf(stdout, "%6d\t", ++line);
+   fprintf(stdout, "%6llu\t", ++line);
if (ferror(stdout))
break;
} else if (eflag) {



Re: ipmi(4): ipmi_poll_thread(): tsleep(9) -> tsleep_nsec(9)

2020-12-10 Thread Scott Cheloha
On Thu, Dec 10, 2020 at 10:00:46AM +0100, Claudio Jeker wrote:
> On Mon, Dec 07, 2020 at 10:54:26PM -0600, Scott Cheloha wrote:
> > Index: ipmi.c
> > ===
> > RCS file: /cvs/src/sys/dev/ipmi.c,v
> > retrieving revision 1.112
> > diff -u -p -r1.112 ipmi.c
> > --- ipmi.c  29 Mar 2020 09:31:10 -  1.112
> > +++ ipmi.c  2 Dec 2020 20:31:57 -
> > @@ -1497,7 +1497,8 @@ ipmi_poll_thread(void *arg)
> > printf("%s: no SDRs IPMI disabled\n", DEVNAME(sc));
> > goto done;
> > }
> > -   while (tsleep(sc, PWAIT, "ipmirun", 1) != EWOULDBLOCK)
> > +   while (tsleep_nsec(sc, PWAIT, "ipmirun",
> > +   MSEC_TO_NSEC(1)) != EWOULDBLOCK)
> > continue;
> > }
> >  
> 
> This idiom of a quick sleep is a bit strange and I would prefer if this is
> rewritten to be a simple tsleep_nsec call without the while loop.
> Since there is no corresponding wakeup call this tsleep can only return
> EWOULDBLOCK there is no way to return any other value (PCATCH is not set
> and nothing will do a wakeup).
> 
> So this could be simply written as:
>   tsleep_nsec(sc, PWAIT, "ipmirun", MSEC_TO_NSEC(1));
> 
> This whole poll thread is just way more complicated than it needs to be.
> Neither current_sensor nor thread->running are needed. I'm not even sure
> if the tsleep itself is needed in that discovery loop. get_sdr() calls
> ipmi_cmd() which does another tsleep to wait for the command.
> 
> This driver seems to just use all the concepts without much thought. I bet
> ipmi_cmd() calls can race against each other.

One thing at a time.

First, remove the loop.  It is unnecessary, as there is no other
thread calling wakeup(9), i.e. tsleep_nsec(9) will always return
EWOULDBLOCK here.

ok?

Index: ipmi.c
===
RCS file: /cvs/src/sys/dev/ipmi.c,v
retrieving revision 1.113
diff -u -p -r1.113 ipmi.c
--- ipmi.c  11 Dec 2020 04:00:33 -  1.113
+++ ipmi.c  11 Dec 2020 04:05:31 -
@@ -1497,9 +1497,7 @@ ipmi_poll_thread(void *arg)
printf("%s: no SDRs IPMI disabled\n", DEVNAME(sc));
goto done;
}
-   while (tsleep_nsec(sc, PWAIT, "ipmirun",
-   MSEC_TO_NSEC(1)) != EWOULDBLOCK)
-   continue;
+   tsleep_nsec(sc, PWAIT, "ipmirun", MSEC_TO_NSEC(1));
}
 
/* initialize sensor list for thread */



pool(9): remove ticks (attempt 2)

2020-12-10 Thread Scott Cheloha
Hi,

We looked at removing the ticks from subr_pool.c a while back but it
got shelved.  That may or may not have been my fault.  I don't
remember.

Anyway, I would normally suggest switching to getuptime(9) here, but
getuptime(9) counts in seconds and we're working with a 1 second
timeout in this code (pool_wait_free) so that's too coarse an
interface for this job.

The next best thing I could come up with was introducing a coarse
sub-second interface for use in this file, "getnsecuptime()", which
calls getnanouptime(9) and converts the result to a 64-bit count of
nanoseconds.  This is relatively fast (we don't read the underlying
timecounter hardware) and causes a minimal amount of code change (we
can use it inline because it returns an integral value).

From there the changes are simple:

- Renames: ph_tick -> ph_timestamp, pr_cache_tick -> pr_cache_timestamp

- Call getnsecuptime(9) wherever we read 'ticks'.

- Change pool_wait_gc and pool_wait_free to counts of nanoseconds.
  They could be macros, e.g.

#define POOL_WAIT_GC	8000000000ULL

  but I'll leave that for a second diff to keep things simple.

This compiles and I haven't changed any logic so I assume it isn't
broken.

We could move getnsecuptime() into kern_tc.c but it isn't used
anywhere else yet so I'm hesitant to do so.

Thoughts?

Index: kern/subr_pool.c
===
RCS file: /cvs/src/sys/kern/subr_pool.c,v
retrieving revision 1.230
diff -u -p -r1.230 subr_pool.c
--- kern/subr_pool.c24 Jan 2020 06:31:17 -  1.230
+++ kern/subr_pool.c10 Dec 2020 22:08:33 -
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -148,7 +149,7 @@ struct pool_page_header {
caddr_t ph_page;/* this page's address */
caddr_t ph_colored; /* page's colored address */
unsigned long   ph_magic;
-   int ph_tick;
+   uint64_tph_timestamp;   /* uptime when last modified */
 };
 #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
 #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)
@@ -266,8 +267,18 @@ void   pool_gc_sched(void *);
 struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
 void   pool_gc_pages(void *);
 struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);
-int pool_wait_free = 1;
-int pool_wait_gc = 8;
+uint64_t pool_wait_free = 1000000000ULL;   /* nanoseconds */
+uint64_t pool_wait_gc = 8000000000ULL; /* nanoseconds */
+
+/* XXX where do I put this? */
+uint64_t
+getnsecuptime(void)
+{
+   struct timespec now;
+
+   getnanouptime(&now);
+   return TIMESPEC_TO_NSEC(&now);
+}
 
 RBT_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare);
 
@@ -797,7 +808,7 @@ pool_put(struct pool *pp, void *v)
/* is it time to free a page? */
if (pp->pr_nidle > pp->pr_maxpages &&
(ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
-   (ticks - ph->ph_tick) > (hz * pool_wait_free)) {
+   getnsecuptime() - ph->ph_timestamp > pool_wait_free) {
freeph = ph;
pool_p_remove(pp, freeph);
}
@@ -864,7 +875,7 @@ pool_do_put(struct pool *pp, void *v)
 */
pp->pr_nidle++;
 
-   ph->ph_tick = ticks;
+   ph->ph_timestamp = getnsecuptime();
TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry);
TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry);
pool_update_curpage(pp);
@@ -1566,7 +1577,7 @@ pool_gc_pages(void *null)
/* is it time to free a page? */
if (pp->pr_nidle > pp->pr_minpages &&
(ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
-   (ticks - ph->ph_tick) > (hz * pool_wait_gc)) {
+   getnsecuptime() - ph->ph_timestamp > pool_wait_gc) {
freeph = ph;
pool_p_remove(pp, freeph);
} else
@@ -1726,7 +1737,7 @@ pool_cache_init(struct pool *pp)
arc4random_buf(pp->pr_cache_magic, sizeof(pp->pr_cache_magic));
TAILQ_INIT(&pp->pr_cache_lists);
pp->pr_cache_nitems = 0;
-   pp->pr_cache_tick = ticks;
+   pp->pr_cache_timestamp = getnsecuptime();
pp->pr_cache_items = 8;
pp->pr_cache_contention = 0;
pp->pr_cache_ngc = 0;
@@ -1829,7 +1840,7 @@ pool_cache_list_free(struct pool *pp, st
 {
pool_list_enter(pp);
if (TAILQ_EMPTY(&pp->pr_cache_lists))
-   pp->pr_cache_tick = ticks;
+   pp->pr_cache_timestamp = getnsecuptime();
 
pp->pr_cache_nitems += POOL_CACHE_ITEM_NITEMS(ci);
TAILQ_INSERT_TAIL(&pp->pr_cache_lists, ci, ci_nextl);
@@ -2006,7 +2017,7 @@ pool_cache_gc(struct pool *pp)
 {
unsigned int contention, delta;
 
-   if ((ticks - pp->pr_cache_tick) > (hz * pool_wait_gc) &&
+  

bpf(4): BIOCGRTIMEOUT, BIOCSRTIMEOUT: protect with bd_mtx

2020-12-10 Thread Scott Cheloha
Hi,

Before converting bpf(4) from using ticks to using real units of time
we need to serialize BIOCGRTIMEOUT and BIOCSRTIMEOUT.  Neither
operation is atomic so we need to use the per-descriptor mutex when
reading or writing the bd_rtout member.

While here we can start annotating the locking for struct members in
bpfdesc.h, too.

ok?

Index: bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.193
diff -u -p -r1.193 bpf.c
--- bpf.c   4 Nov 2020 04:40:13 -   1.193
+++ bpf.c   10 Dec 2020 17:24:43 -
@@ -873,9 +873,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
break;
}
rtout += tv->tv_usec / tick;
+   mtx_enter(&d->bd_mtx);
d->bd_rtout = rtout;
if (d->bd_rtout == 0 && tv->tv_usec != 0)
d->bd_rtout = 1;
+   mtx_leave(&d->bd_mtx);
break;
}
 
@@ -886,8 +888,10 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
{
struct timeval *tv = (struct timeval *)addr;
 
+   mtx_enter(&d->bd_mtx);
tv->tv_sec = d->bd_rtout / hz;
tv->tv_usec = (d->bd_rtout % hz) * tick;
+   mtx_leave(&d->bd_mtx);
break;
}
 
Index: bpfdesc.h
===
RCS file: /cvs/src/sys/net/bpfdesc.h,v
retrieving revision 1.41
diff -u -p -r1.41 bpfdesc.h
--- bpfdesc.h   13 May 2020 21:34:37 -  1.41
+++ bpfdesc.h   10 Dec 2020 17:24:43 -
@@ -42,6 +42,13 @@
 
 #ifdef _KERNEL
 
+/*
+ * Locks used to protect struct members in this file:
+ *
+ * I   immutable after initialization
+ * m   the per-descriptor mutex (bpf_d.bd_mtx)
+ */
+
 struct bpf_program_smr {
struct bpf_program  bps_bf;
struct smr_entrybps_smr;
@@ -72,7 +79,7 @@ struct bpf_d {
int bd_in_uiomove;  /* for debugging purpose */
 
struct bpf_if  *bd_bif; /* interface descriptor */
-   u_long  bd_rtout;   /* Read timeout in 'ticks' */
+   u_long  bd_rtout;   /* [m] Read timeout in 'ticks' */
u_long  bd_rdStart; /* when the read started */
int bd_rnonblock;   /* true if nonblocking reads are set */
struct bpf_program_smr



Re: ipmi(4): ipmi_poll_thread(): tsleep(9) -> tsleep_nsec(9)

2020-12-07 Thread Scott Cheloha
On Wed, Dec 02, 2020 at 11:43:32PM +0100, Mark Kettenis wrote:
> > From: "Constantine A. Murenin" 
> > Date: Wed, 2 Dec 2020 14:04:52 -0800
> > 
> > Not sure if you've seen it, but ipmi(4) has been disabled for over 12
> > years, because it's broken on some machines, so, this code is not
> > necessarily guaranteed to be correct as-is.
> > 
> > http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/arch/i386/conf/GENERIC#rev1.632
> > http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/arch/amd64/conf/GENERIC#rev1.238
> 
> The driver is actually enabled on arm64.  And I'll probably enable it
> on powerpc64 at some point.

kettenis@/sthen@:

In that case, could one or both of you test this diff?

I doubt anyone remembers why we spin until tsleep(9) returns
EWOULDBLOCK.  If we can confirm that the driver still works with a 1ms
block in this spot then that's good enough for me.

So, does this still work?

Index: ipmi.c
===
RCS file: /cvs/src/sys/dev/ipmi.c,v
retrieving revision 1.112
diff -u -p -r1.112 ipmi.c
--- ipmi.c  29 Mar 2020 09:31:10 -  1.112
+++ ipmi.c  2 Dec 2020 20:31:57 -
@@ -1497,7 +1497,8 @@ ipmi_poll_thread(void *arg)
printf("%s: no SDRs IPMI disabled\n", DEVNAME(sc));
goto done;
}
-   while (tsleep(sc, PWAIT, "ipmirun", 1) != EWOULDBLOCK)
+   while (tsleep_nsec(sc, PWAIT, "ipmirun",
+   MSEC_TO_NSEC(1)) != EWOULDBLOCK)
continue;
}
 



Re: srp_finalize(9): tsleep(9) -> tsleep_nsec(9)

2020-12-04 Thread Scott Cheloha
On Fri, Dec 04, 2020 at 09:56:02AM +0100, Claudio Jeker wrote:
> On Thu, Dec 03, 2020 at 10:05:30PM -0600, Scott Cheloha wrote:
> > Hi,
> > 
> > srp_finalize(9) uses tsleep(9) to spin while it waits for the object's
> > refcount to reach zero.  It blocks for up to 1 tick and then checks
> > the refcount again and again.
> > 
> > We can just as easily do this with tsleep_nsec(9) and block for 1
> > millisecond per interval.
> > 
> > ok?
> > 
> > Index: kern_srp.c
> > ===
> > RCS file: /cvs/src/sys/kern/kern_srp.c,v
> > retrieving revision 1.12
> > diff -u -p -r1.12 kern_srp.c
> > --- kern_srp.c  8 Sep 2017 05:36:53 -   1.12
> > +++ kern_srp.c  4 Dec 2020 04:04:39 -
> > @@ -274,7 +274,7 @@ void
> >  srp_finalize(void *v, const char *wmesg)
> >  {
> > while (srp_referenced(v))
> > -   tsleep(v, PWAIT, wmesg, 1);
> > +   tsleep_nsec(v, PWAIT, wmesg, MSEC_TO_NSEC(1));
> >  }
> >  
> >  #else /* MULTIPROCESSOR */
> > 
> 
> Why only 1ms instead of the original 10ms (at least on most archs)?

The underlying implementation can only process timeouts from
hardclock(9) which runs about hz times per second.  If we tell the
thread to "sleep for 10ms" it's almost always going to overshoot the
next hardclock(9) and wind up sleeping ~20ms.

Some people run with HZ=1000 kernels.  I don't think many people run
with kernels with a higher HZ than that, though.  So I figure a 1ms
sleep is "good enough" for all practical kernels.  On HZ=100 kernels
the thread will oversleep because it doesn't process timeouts often
enough to honor the 1ms request.

Basically I'm trying to pick a reasonable polling interval (not too
fast) that also won't cause the existing default kernel to block for
longer than it already does (~10ms).  The default kernel is HZ=100, so
a 1ms sleep will, in this case, almost always sleep ~10ms per
iteration of this loop.
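
To make the rounding concrete, here's roughly the arithmetic the
tick-based sleep path performs (a simplification, not the literal
kernel code):

	/* One tick is 1/hz seconds. */
	uint64_t tick_nsecs = 1000000000ULL / hz;

	/* Round the request up to whole ticks. */
	int nticks = howmany(MSEC_TO_NSEC(1), tick_nsecs);

	/* HZ=100:  nticks == 1, so we block until the next
	 * hardclock(9), i.e. ~10ms.  HZ=1000: ~1ms. */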

It's a bit of a chicken-and-egg problem.

Does that make any sense?



Re: mbg(4): tsleep(9) -> tsleep_nsec(9)

2020-12-04 Thread Scott Cheloha
On Fri, Dec 04, 2020 at 10:07:07AM +0100, Claudio Jeker wrote:
> On Thu, Dec 03, 2020 at 10:42:50PM -0600, Scott Cheloha wrote:
> > Hi,
> > 
> > mbg(4) is among the few remaining drivers using tsleep(9).
> > 
> > In a few spots, when the kernel is not cold, the driver will spin for
> > up to 1/10 seconds waiting for the MBG_BUSY flag to go low.
> > 
> > We can approximate this behavior by spinning 10 times and sleeping 10
> > milliseconds each iteration.  10 x 10ms = 100ms = 1/10 seconds.
> > 
> > I can't test this but I was able to compile it on amd64.  It isn't
> > normally built for amd64, though.  Just i386.
> > 
> > I have my doubts that anyone has this card and is able to actually
> > test this diff.
> > 
> > Anybody ok?
> 
> This code needs to wait for around 70us for the card to process the
> command (according to the comment). The cold code sleeps a max of
> 50 * 20us (1ms). I don't see why the regular code should sleep so much
> longer. I would suggest to use a 1ms timeout and loop 10 times. This is a
> magnitude more than enough and most probably only one cycle will be
> needed.
> 
> IIRC someone got a mbg(4) device some time ago apart from mbalmer@

Makes sense to me.  Updated diff attached.

How are we going to find this person?

Index: mbg.c
===
RCS file: /cvs/src/sys/dev/pci/mbg.c,v
retrieving revision 1.31
diff -u -p -r1.31 mbg.c
--- mbg.c   29 Nov 2020 03:17:27 -  1.31
+++ mbg.c   4 Dec 2020 18:07:43 -
@@ -417,12 +417,12 @@ mbg_read_amcc_s5920(struct mbg_softc *sc
 
/* wait for the BUSY flag to go low (approx 70 us on i386) */
timer = 0;
-   tmax = cold ? 50 : hz / 10;
+   tmax = cold ? 50 : 10;
do {
if (cold)
delay(20);
else
-   tsleep(tstamp, 0, "mbg", 1);
+   tsleep_nsec(tstamp, 0, "mbg", MSEC_TO_NSEC(1));
status = bus_space_read_1(sc->sc_iot, sc->sc_ioh,
AMCC_IMB4 + 3);
} while ((status & MBG_BUSY) && timer++ < tmax);
@@ -473,12 +473,12 @@ mbg_read_amcc_s5933(struct mbg_softc *sc
 
/* wait for the BUSY flag to go low (approx 70 us on i386) */
timer = 0;
-   tmax = cold ? 50 : hz / 10;
+   tmax = cold ? 50 : 10;
do {
if (cold)
delay(20);
else
-   tsleep(tstamp, 0, "mbg", 1);
+   tsleep_nsec(tstamp, 0, "mbg", MSEC_TO_NSEC(1));
status = bus_space_read_1(sc->sc_iot, sc->sc_ioh,
AMCC_IMB4 + 3);
} while ((status & MBG_BUSY) && timer++ < tmax);
@@ -525,12 +525,12 @@ mbg_read_asic(struct mbg_softc *sc, int 
 
/* wait for the BUSY flag to go low */
timer = 0;
-   tmax = cold ? 50 : hz / 10;
+   tmax = cold ? 50 : 10;
do {
if (cold)
delay(20);
else
-   tsleep(tstamp, 0, "mbg", 1);
+   tsleep_nsec(tstamp, 0, "mbg", MSEC_TO_NSEC(1));
status = bus_space_read_1(sc->sc_iot, sc->sc_ioh, ASIC_STATUS);
} while ((status & MBG_BUSY) && timer++ < tmax);
 



mbg(4): tsleep(9) -> tsleep_nsec(9)

2020-12-03 Thread Scott Cheloha
Hi,

mbg(4) is among the few remaining drivers using tsleep(9).

In a few spots, when the kernel is not cold, the driver will spin for
up to 1/10 seconds waiting for the MBG_BUSY flag to go low.

We can approximate this behavior by spinning 10 times and sleeping 10
milliseconds each iteration.  10 x 10ms = 100ms = 1/10 seconds.

I can't test this but I was able to compile it on amd64.  It isn't
normally built for amd64, though.  Just i386.

I have my doubts that anyone has this card and is able to actually
test this diff.

Anybody ok?

Index: mbg.c
===
RCS file: /cvs/src/sys/dev/pci/mbg.c,v
retrieving revision 1.31
diff -u -p -r1.31 mbg.c
--- mbg.c   29 Nov 2020 03:17:27 -  1.31
+++ mbg.c   4 Dec 2020 04:39:59 -
@@ -417,12 +417,12 @@ mbg_read_amcc_s5920(struct mbg_softc *sc
 
/* wait for the BUSY flag to go low (approx 70 us on i386) */
timer = 0;
-   tmax = cold ? 50 : hz / 10;
+   tmax = cold ? 50 : 10;
do {
if (cold)
delay(20);
else
-   tsleep(tstamp, 0, "mbg", 1);
+   tsleep_nsec(tstamp, 0, "mbg", MSEC_TO_NSEC(10));
status = bus_space_read_1(sc->sc_iot, sc->sc_ioh,
AMCC_IMB4 + 3);
} while ((status & MBG_BUSY) && timer++ < tmax);
@@ -473,12 +473,12 @@ mbg_read_amcc_s5933(struct mbg_softc *sc
 
/* wait for the BUSY flag to go low (approx 70 us on i386) */
timer = 0;
-   tmax = cold ? 50 : hz / 10;
+   tmax = cold ? 50 : 10;
do {
if (cold)
delay(20);
else
-   tsleep(tstamp, 0, "mbg", 1);
+   tsleep_nsec(tstamp, 0, "mbg", MSEC_TO_NSEC(10));
status = bus_space_read_1(sc->sc_iot, sc->sc_ioh,
AMCC_IMB4 + 3);
} while ((status & MBG_BUSY) && timer++ < tmax);
@@ -525,12 +525,12 @@ mbg_read_asic(struct mbg_softc *sc, int 
 
/* wait for the BUSY flag to go low */
timer = 0;
-   tmax = cold ? 50 : hz / 10;
+   tmax = cold ? 50 : 10;
do {
if (cold)
delay(20);
else
-   tsleep(tstamp, 0, "mbg", 1);
+   tsleep_nsec(tstamp, 0, "mbg", MSEC_TO_NSEC(10));
status = bus_space_read_1(sc->sc_iot, sc->sc_ioh, ASIC_STATUS);
} while ((status & MBG_BUSY) && timer++ < tmax);
 



srp_finalize(9): tsleep(9) -> tsleep_nsec(9)

2020-12-03 Thread Scott Cheloha
Hi,

srp_finalize(9) uses tsleep(9) to spin while it waits for the object's
refcount to reach zero.  It blocks for up to 1 tick and then checks
the refcount again and again.

We can just as easily do this with tsleep_nsec(9) and block for 1
millisecond per interval.

ok?

Index: kern_srp.c
===
RCS file: /cvs/src/sys/kern/kern_srp.c,v
retrieving revision 1.12
diff -u -p -r1.12 kern_srp.c
--- kern_srp.c  8 Sep 2017 05:36:53 -   1.12
+++ kern_srp.c  4 Dec 2020 04:04:39 -
@@ -274,7 +274,7 @@ void
 srp_finalize(void *v, const char *wmesg)
 {
while (srp_referenced(v))
-   tsleep(v, PWAIT, wmesg, 1);
+   tsleep_nsec(v, PWAIT, wmesg, MSEC_TO_NSEC(1));
 }
 
 #else /* MULTIPROCESSOR */



tht(4): more tsleep(9) -> tsleep_nsec(9) conversions

2020-12-03 Thread Scott Cheloha
Hi,

tht(4) is another driver still using tsleep(9).

It uses it to spin while it waits for the card to load the firmware.
Then it uses it to spin for up to 2 seconds while waiting for
THT_REG_INIT_STATUS.

In the firmware case we can sleep for 10 milliseconds each iteration.

In the THT_REG_INIT_STATUS loop we can sleep for 10 milliseconds each
iteration again, but instead of using a timeout to set a flag after 2
seconds we can just count how many milliseconds we've slept.  This is
less precise than using the timeout but it is much simpler.  Obviously
we then need to remove all the timeout-related stuff from the function
and the file.

Thoughts?  ok?

Index: if_tht.c
===
RCS file: /cvs/src/sys/dev/pci/if_tht.c,v
retrieving revision 1.142
diff -u -p -r1.142 if_tht.c
--- if_tht.c10 Jul 2020 13:26:38 -  1.142
+++ if_tht.c4 Dec 2020 03:57:21 -
@@ -582,7 +582,6 @@ voidtht_lladdr_read(struct tht_softc
 void   tht_lladdr_write(struct tht_softc *);
 inttht_sw_reset(struct tht_softc *);
 inttht_fw_load(struct tht_softc *);
-void   tht_fw_tick(void *arg);
 void   tht_link_state(struct tht_softc *);
 
 /* interface operations */
@@ -1667,11 +1666,9 @@ tht_sw_reset(struct tht_softc *sc)
 int
 tht_fw_load(struct tht_softc *sc)
 {
-   struct timeout  ticker;
-   volatile intok = 1;
u_int8_t*fw, *buf;
size_t  fwlen, wrlen;
-   int error = 1;
+   int error = 1, msecs, ret;
 
if (loadfirmware("tht", , ) != 0)
return (1);
@@ -1682,7 +1679,9 @@ tht_fw_load(struct tht_softc *sc)
buf = fw;
while (fwlen > 0) {
while (tht_fifo_writable(sc, &sc->sc_txt) <= THT_FIFO_GAP) {
-   if (tsleep(sc, PCATCH, "thtfw", 1) == EINTR)
+   ret = tsleep_nsec(sc, PCATCH, "thtfw",
+   MSEC_TO_NSEC(10));
+   if (ret == EINTR)
goto err;
}
 
@@ -1695,32 +1694,21 @@ tht_fw_load(struct tht_softc *sc)
buf += wrlen;
}
 
-   timeout_set(&ticker, tht_fw_tick, (void *)&ok);
-   timeout_add_sec(&ticker, 2);
-   while (ok) {
+   for (msecs = 0; msecs < 2000; msecs += 10) {
if (tht_read(sc, THT_REG_INIT_STATUS) != 0) {
error = 0;
break;
}
-
-   if (tsleep(sc, PCATCH, "thtinit", 1) == EINTR)
+   ret = tsleep_nsec(sc, PCATCH, "thtinit", MSEC_TO_NSEC(10));
+   if (ret == EINTR)
goto err;
}
-   timeout_del(&ticker);
 
tht_write(sc, THT_REG_INIT_SEMAPHORE, 0x1);
 
 err:
free(fw, M_DEVBUF, fwlen);
return (error);
-}
-
-void
-tht_fw_tick(void *arg)
-{
-   volatile int*ok = arg;
-
-   *ok = 0;
 }
 
 void



hvn(4): msleep(9) -> msleep_nsec(9)

2020-12-03 Thread Scott Cheloha
Hi,

In hvn_alloc_cmd() we spin within a mutex while the freelist is empty.
Because we're using a mutex there is no way to miss the wakeup(9) from
hvn_free_cmd(), so we don't even need a timeout here.  Instead of
doing msleep(9) for 1 tick repeatedly we can call msleep_nsec(9) with
no timeout (INFSLP) and stop polling in this spot.
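
For context, the free side takes the same mutex before calling
wakeup(9), so the handoff looks roughly like this (a sketch of the
pattern, not the literal driver code):

	void
	hvn_free_cmd_sketch(struct hvn_softc *sc, struct rndis_cmd *rc)
	{
		mtx_enter(&sc->sc_cntl_fqlck);
		TAILQ_INSERT_TAIL(&sc->sc_cntl_fq, rc, rc_entry);
		/* The waiter sleeps while holding the mutex, so this
		 * wakeup(9) cannot land between its check and its
		 * sleep. */
		wakeup(&sc->sc_cntl_fq);
		mtx_leave(&sc->sc_cntl_fqlck);
	}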

Once again I can't test this.  But the principle of protecting the
sleep with a mutex is sound.

ok?

Index: if_hvn.c
===
RCS file: /cvs/src/sys/dev/pv/if_hvn.c,v
retrieving revision 1.43
diff -u -p -r1.43 if_hvn.c
--- if_hvn.c4 Dec 2020 03:22:46 -   1.43
+++ if_hvn.c4 Dec 2020 03:35:25 -
@@ -1127,8 +1127,8 @@ hvn_alloc_cmd(struct hvn_softc *sc)
 
mtx_enter(&sc->sc_cntl_fqlck);
while ((rc = TAILQ_FIRST(&sc->sc_cntl_fq)) == NULL)
-   msleep(&sc->sc_cntl_fq, &sc->sc_cntl_fqlck,
-   PRIBIO, "nvsalloc", 1);
+   msleep_nsec(&sc->sc_cntl_fq, &sc->sc_cntl_fqlck,
+   PRIBIO, "nvsalloc", INFSLP);
TAILQ_REMOVE(&sc->sc_cntl_fq, rc, rc_entry);
mtx_leave(&sc->sc_cntl_fqlck);
return (rc);



softraid(4): more tsleep(9) -> tsleep_nsec(9) conversions

2020-12-03 Thread Scott Cheloha
Hi,

softraid(4) is another driver still using tsleep(9).

softraid(4) uses tsleep(9) in one spot to poll for completion.  It
uses it in two other spots to momentarily yield the CPU to permit a
different thread to make progress.

In all cases the length of the timeout is totally arbitrary.  We're
just spinning or yielding.  The duration doesn't matter much.

Currently we tsleep(9) for up to 1 tick.  Instead let's tsleep_nsec(9)
for 1 millisecond.

ok?

Index: softraid.c
===
RCS file: /cvs/src/sys/dev/softraid.c,v
retrieving revision 1.416
diff -u -p -r1.416 softraid.c
--- softraid.c  15 Oct 2020 00:13:47 -  1.416
+++ softraid.c  4 Dec 2020 03:29:20 -
@@ -3885,7 +3885,7 @@ sr_discipline_shutdown(struct sr_discipl
if (sd->sd_reb_active) {
sd->sd_reb_abort = 1;
while (sd->sd_reb_active)
-   tsleep(sd, PWAIT, "sr_shutdown", 1);
+   tsleep_nsec(sd, PWAIT, "sr_shutdown", MSEC_TO_NSEC(1));
}
 
if (meta_save)
@@ -4765,7 +4765,7 @@ sr_rebuild(struct sr_discipline *sd)
}
/* yield if we didn't sleep */
if (slept == 0)
-   tsleep(sc, PWAIT, "sr_yield", 1);
+   tsleep_nsec(sc, PWAIT, "sr_yield", MSEC_TO_NSEC(1));
 
sr_scsi_wu_put(sd, wu_r);
sr_scsi_wu_put(sd, wu_w);
Index: softraid_raid5.c
===
RCS file: /cvs/src/sys/dev/softraid_raid5.c,v
retrieving revision 1.30
diff -u -p -r1.30 softraid_raid5.c
--- softraid_raid5.c26 Mar 2020 11:28:23 -  1.30
+++ softraid_raid5.c4 Dec 2020 03:29:20 -
@@ -857,8 +857,10 @@ sr_raid5_rebuild(struct sr_discipline *s
tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP);
slept = 1;
}
-   if (!slept)
-   tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
+   if (!slept) {
+   tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
+   MSEC_TO_NSEC(1));
+   }
 
sr_scsi_wu_put(sd, wu_r);
sr_scsi_wu_put(sd, wu_w);



ipmi(4): ipmi_poll_thread(): tsleep(9) -> tsleep_nsec(9)

2020-12-02 Thread Scott Cheloha
Hi,

ipmi(4) is one of the few remaining callers of tsleep(9).  I want to
convert it to use tsleep_nsec(9) but I need some clarification on what
the code in question is doing.

In ipmi_poll_thread() we initialize all the sensors in a loop.
Between each get_sdr() call we tsleep(9) for 1 tick.

So, I wonder:

1. Why do we sleep here?  Is get_sdr() slow and we don't want to
   hog the CPU?  We block for a tick to let other stuff run?  Or
   is there some other reason?

2. Why are we spinning until tsleep(9) returns EWOULDBLOCK?  Can
   another thread interrupt the sleep with a wakeup(9)?

marco@ added the tsleep(9) loop in this commit:

http://cvsweb.openbsd.org/src/sys/dev/ipmi.c?rev=1.59&content-type=text/x-cvsweb-markup

Any ideas?

My guess is that the attached diff will work fine but if we could
remove the loop while we're here that would simplify the code.

Index: ipmi.c
===
RCS file: /cvs/src/sys/dev/ipmi.c,v
retrieving revision 1.112
diff -u -p -r1.112 ipmi.c
--- ipmi.c  29 Mar 2020 09:31:10 -  1.112
+++ ipmi.c  2 Dec 2020 20:31:57 -
@@ -1497,7 +1497,8 @@ ipmi_poll_thread(void *arg)
printf("%s: no SDRs IPMI disabled\n", DEVNAME(sc));
goto done;
}
-   while (tsleep(sc, PWAIT, "ipmirun", 1) != EWOULDBLOCK)
+   while (tsleep_nsec(sc, PWAIT, "ipmirun",
+   MSEC_TO_NSEC(1)) != EWOULDBLOCK)
continue;
}
 



hvn(4), hyperv(4): tsleep(9) -> tsleep_nsec(9)

2020-12-02 Thread Scott Cheloha
Hi,

I'd like to convert all of the wait loops in hvn(4) and hyperv(4) from
using ticks to using real units of time.

Most of them use tsleep(9), so let's do tsleep(9) -> tsleep_nsec(9)
conversions first.

In every case there is an adjacent delay(9) call we use if the kernel
is cold.  I assume it's okay to give the same interval to tsleep_nsec(9)
as we do to delay(9).

mikeb@: This compiles on amd64 but I can't test it.  Can you verify
that these changes don't break anything?

If so, ok?

Also: The loop in hvn_rndis_cmd() delays for 100 microseconds while
all the other loops in that driver delay for 1000 microseconds.  Just
wanted to make sure this was intentional.  I'm sure it's fine, it just
stuck out to me as a possible typo.

Index: if_hvn.c
===
RCS file: /cvs/src/sys/dev/pv/if_hvn.c,v
retrieving revision 1.42
diff -u -p -r1.42 if_hvn.c
--- if_hvn.c30 Aug 2020 10:36:33 -  1.42
+++ if_hvn.c2 Dec 2020 17:50:42 -
@@ -1048,8 +1048,10 @@ hvn_nvs_cmd(struct hvn_softc *sc, void *
if (rv == EAGAIN) {
if (cold)
delay(1000);
-   else
-   tsleep(cmd, PRIBIO, "nvsout", 1);
+   else {
+   tsleep_nsec(cmd, PRIBIO, "nvsout",
+   USEC_TO_NSEC(1000));
+   }
} else if (rv) {
DPRINTF("%s: NVSP operation %u send error %d\n",
sc->sc_dev.dv_xname, hdr->nvs_type, rv);
@@ -1069,8 +1071,10 @@ hvn_nvs_cmd(struct hvn_softc *sc, void *
do {
if (cold)
delay(1000);
-   else
-   tsleep(sc, PRIBIO | PCATCH, "nvscmd", 1);
+   else {
+   tsleep_nsec(sc, PRIBIO | PCATCH, "nvscmd",
+   USEC_TO_NSEC(1000));
+   }
s = splnet();
hvn_nvs_intr(sc);
splx(s);
@@ -1366,8 +1370,10 @@ hvn_rndis_cmd(struct hvn_softc *sc, stru
if (rv == EAGAIN) {
if (cold)
delay(100);
-   else
-   tsleep(rc, PRIBIO, "rndisout", 1);
+   else {
+   tsleep_nsec(rc, PRIBIO, "rndisout",
+   USEC_TO_NSEC(100));
+   }
} else if (rv) {
DPRINTF("%s: RNDIS operation %u send error %d\n",
sc->sc_dev.dv_xname, hdr->rm_type, rv);
@@ -1388,8 +1394,10 @@ hvn_rndis_cmd(struct hvn_softc *sc, stru
do {
if (cold)
delay(1000);
-   else
-   tsleep(rc, PRIBIO | PCATCH, "rndiscmd", 1);
+   else {
+   tsleep_nsec(rc, PRIBIO | PCATCH, "rndiscmd",
+   USEC_TO_NSEC(1000));
+   }
s = splnet();
hvn_nvs_intr(sc);
splx(s);
Index: hyperv.c
===
RCS file: /cvs/src/sys/dev/pv/hyperv.c,v
retrieving revision 1.46
diff -u -p -r1.46 hyperv.c
--- hyperv.c6 Jul 2020 13:33:09 -   1.46
+++ hyperv.c2 Dec 2020 17:50:42 -
@@ -558,8 +558,10 @@ hv_start(struct hv_softc *sc, struct hv_
s = splnet();
hv_intr();
splx(s);
-   } else
-   tsleep(wchan, PRIBIO, wchan, 1);
+   } else {
+   tsleep_nsec(wchan, PRIBIO, wchan,
+   USEC_TO_NSEC(delays[i]));
+   }
}
if (status != 0) {
printf("%s: posting vmbus message failed with %d\n",
@@ -620,8 +622,10 @@ hv_wait(struct hv_softc *sc, int (*cond)
s = splnet();
hv_intr();
splx(s);
-   } else
-   tsleep(wchan, PRIBIO, wmsg ? wmsg : "hvwait", 1);
+   } else {
+   tsleep_nsec(wchan, PRIBIO, wmsg ? wmsg : "hvwait",
+   USEC_TO_NSEC(1000));
+   }
}
 }
 



Re: stdio: fclose(3), fopen(3): intended locking hierarchy?

2020-12-01 Thread Scott Cheloha
On Mon, Nov 30, 2020 at 08:44:41PM -0800, Philip Guenther wrote:
> On Mon, Nov 30, 2020 at 6:10 PM Scott Cheloha 
> wrote:
> 
> > On Sat, Nov 28, 2020 at 01:41:50PM -0800, Philip Guenther wrote:
> >
> ...
> 
> > > After thinking through states more, #4 isn't safe: _fwalk() can't take
> > > sfp_mutex because it's called from __srefill() which has its FILE locked,
> > > which would reverse the order: two concurrent calls to __srefill() from
> > > line-buffered FILEs could have one in _fwalk() blocking on locking the
> > > other, while the other blocks on the sfp_mutex for _fwalk().
> >
> > This part in __srefill():
> >
> > /*
> >  * Before reading from a line buffered or unbuffered file,
> >  * flush all line buffered output files, per the ANSI C
> >  * standard.
> >  */
> >
> ...
> 
> > Where in the standard(s) is this behavior required?  I'm not even sure
> > how to look for it.
> >
> 
> Pick a version of the C standard and search for "buffered" until you find
> something like this, which is part of 7.19.3p3 in the draft I'm looking at:
> 
>    <...>  When a stream is line buffered, characters are intended to be
>    transmitted to or from the host environment as a block when a new-line
>    character is encountered.  Furthermore, characters are intended to be
>    transmitted as a block to the host environment when a buffer is filled,
>    when input is requested on an unbuffered stream, or when input is
>    requested on a line buffered stream that requires the transmission of
>    characters from the host environment.  Support for these characteristics
>    is implementation-defined, and may be affected via the setbuf and setvbuf
>    functions.
> 
> Effectively the same text appears in the POSIX standard in XSH 2.5p6.
> 
> Basically, you're allowed to stop doing that by instead printing out your
> cell-phone number so that everyone who wants to complain that their program
> failed to output its prompt before blocking for input can call and scream
> at you.  :D

If they want that they need to call fflush(3) on the stream the prompt
is printed to if that stream is line- or block-buffered.

Anyway, the key phrase is:

> Furthermore, characters are intended to be transmitted as a block to the host
> environment when a buffer is filled, when input is requested on an unbuffered
> stream, or when input is requested on a line buffered stream that requires the
> transmission of characters from the host environment.

When I read this I get the impression they are only talking about the
FILE being read, not that we should transfer characters as a block for
*every* open FILE when input is requested on any unbuffered or
line-buffered stream.

But I am not certain exactly what is meant by the passage you quoted.
It is pretty vague.

One way around the deadlock in __srefill() is to add a FTRYLOCKFILE()
macro and do that during _fwalk() before acting on each FILE.  If we
can't lock the FILE we just move on to the next one.
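
A sketch of that idea, assuming we wrap ftrylockfile(3) the same way
FLOCKFILE() wraps flockfile(3):

	/* Hypothetical: non-blocking FLOCKFILE(); nonzero on success. */
	#define FTRYLOCKFILE(fp)	(ftrylockfile(fp) == 0)

	static int
	lflush_try(FILE *fp)
	{
		if (!FTRYLOCKFILE(fp))
			return (0);	/* busy elsewhere; skip it */
		if ((fp->_flags & (__SLBF|__SWR)) == (__SLBF|__SWR))
			__sflush(fp);
		FUNLOCKFILE(fp);
		return (0);
	}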

That doesn't seem right to me though.

FWIW, musl doesn't flush every file when refilling a line- or
unbuffered FILE.  Then again, maybe they misinterpreted the standard. 



Re: stdio: fclose(3), fopen(3): intended locking hierarchy?

2020-11-30 Thread Scott Cheloha
On Sat, Nov 28, 2020 at 01:41:50PM -0800, Philip Guenther wrote:
> On Fri, Nov 27, 2020 at 10:35 PM Philip Guenther  wrote:
> ...
> 
> > So yeah, maybe it does work to:
> > 1) make __sfp() FLOCKFILE() the allocated FILE before unlocking sfp_mutex
> > 2) update f{,d,mem,un}open() and open_*memstream() to match (1), unlocking
> >in all paths when the FILE is safe to be accessed by _fwalk(), and
> > locking
> >sfp_mutex around the zeroing of _flags.
> > 3) make fclose() and freopen() also lock sfp_mutex around the zeroing of
> > _flags
> >(should add an _frelease() to findfp.c that does this dance for (2) and
> > (3))
> > 4) make _fwalk() take sfp_mutex, and maybe also a FILE* so the setting of
> >__SIGN can be done under the lock?
> >
> 5) decide how/whether to handle adjust the FLOCKFILE placement in the
> > _fwalk()
> >tree: is the testing of the "is line-buffered" flag in lflush() safe
> > without
> >a lock?  Mumble...
> >
> 
> After thinking through states more, #4 isn't safe: _fwalk() can't take
> sfp_mutex because it's called from __srefill() which has its FILE locked,
> which would reverse the order: two concurrent calls to __srefill() from
> line-buffered FILEs could have one in _fwalk() blocking on locking the
> other, while the other blocks on the sfp_mutex for _fwalk().

This part in __srefill():

/*
 * Before reading from a line buffered or unbuffered file,
 * flush all line buffered output files, per the ANSI C
 * standard.
 */
if (fp->_flags & (__SLBF|__SNBF)) {
/* Ignore this file in _fwalk to avoid potential deadlock. */
fp->_flags |= __SIGN;
(void) _fwalk(lflush);
fp->_flags &= ~__SIGN;

/* Now flush this file without locking it. */
if ((fp->_flags & (__SLBF|__SWR)) == (__SLBF|__SWR))
__sflush(fp);
}

seems to confound all sensible locking hierarchies.

You need to lock the FILE you're trying to refill.  Then you need to
check how it is buffered.  If it is buffered in a particular way, this
is supposed to trigger a flush on *all other* line-buffered FILEs?

How can you do that without possible deadlock without first yielding
the lock for the FILE in question?  And then you've got a race?

Where in the standard(s) is this behavior required?  I'm not even sure
how to look for it.

> Hmm, there's currently a loop between two __srefill() calls like that, as
> there's nothing to force visibility of the __SIGN flag between CPUs so they
> could try to lock each other.  Grrr.
> 
> Time to check other BSDs and see if they have a good solution to this...

I'd say so.



Re: stdio: fclose(3), fopen(3): intended locking hierarchy?

2020-11-30 Thread Scott Cheloha
On Fri, Nov 27, 2020 at 10:35:59PM -0800, Philip Guenther wrote:
> On Wed, Nov 25, 2020 at 4:23 PM Scott Cheloha 
> wrote:
> 
> > In stdio, which lock are you supposed to take first?  The global
> > sfp_mutex or the per-FILE lock?
> >
> > In __sfp() we hold sfp_mutex while iterating through the pool (unsure
> > what else to call it) of FILEs.  No two threads can modify the pool at
> > the same time:
> >
> ...
> 
> > Note that we set _flags to 1 to reserve it for the current thread
> > before leaving sfp_mutex.  Note also that we don't take the per-FILE
> > lock before reading each FILE's _flags.
> >
> > Then look at fclose(3):
> >
> ...
> 
> > We check if _flags is zero without any lock.  I'm unsure if this is
> > safe.
> >
> > However, we then clean up under the FILE's lock and set _flags to zero
> > without sfp_mutex.
> >
> > ... that can't be right.
> >
> > So, what to do?  My immediate thought was to export sfp_mutex and
> > enter it before writing _flags (diff attached).  But then the global
> > sfp_mutex is "higher" in the locking hierarchy than the per-FILE lock.
> > That doesn't seem quite right to me.
> >
> > We also modify _flags all over stdio without sfp_mutex, so the rule is
> > inconsistent.
> >
> > Another possibility is to take the per-FILE lock when examining each
> > FILE's _flags during __sfp().  That would be costlier, but then the
> > hierarchy would be reversed.
> >
> > Thoughts?
> >
> 
> Let's say that we're willing to presume that changing _flags from
> one non-zero value to another non-zero value will never result in
> a zero value being visible either on this CPU or another one.  If
> that's not true, then there's more to fix, but let's start with
> that assumption.

Sure.

> Given that, I think the only unsafe item in what you described above
> is the setting of _flags to zero in various places without either
> holding sfp_mutex or using some sort of membar (or atomic op) to
> guarantee all previous changes to the FILE are visible before the
> flags change is visible.
> 
> My reasoning would be that if the setting of _flags from non-zero
> to zero was always the last thing visible, then the code scanning
> the list could be sure that a non-zero flags means no one else has
> any pending writes to the FILE and it can be allocated.  __sfp()'s
> setting _flags to 1 to mark it as allocated is made visible to other
> threads when it unlocks sfp_mutex.
> 
> ...but we don't have those membars/atomic-ops, so it's not currently
> guaranteed that __sfp() can't allocate a FILE which is still being
> updated by a thread that's releasing it.  ;(

We can't use the stuff in sys/atomic.h in userspace?
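
The idea, expressed with C11 atomics purely for illustration (not
necessarily what libc would actually use):

	#include <stdatomic.h>

	/* Releasing side: make all prior writes to *fp visible before
	 * the zeroed _flags can be observed by another CPU. */
	atomic_store_explicit((_Atomic int *)&fp->_flags, 0,
	    memory_order_release);

	/* __sfp() side: pair it with an acquire load. */
	flags = atomic_load_explicit((_Atomic int *)&fp->_flags,
	    memory_order_acquire);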

> If strewing membars makes people twitchy (my eye twitches some),
> then yeah, your proposal to take sfp_mutex when zeroing _file is
> te alternative.  Regarding the hierarchy concern, see below.

Ack.

> None of this fixes _fwalk(), which can invoke the callback on
> partially created FILEs, even if it were to grab sfp_mutex.  I can
> imagine a couple directions for fixing that, from setting __SIGN
> on not-yet-completed FILEs and clearing it at the end, to full-blown
> having __sfp() return a locked FILE and making _fwalk() lock each
> FILE before invoking the callback.  Note that of the three callbacks
> passed to _fwalk(), two end up locking the FILE anyway, so maybe
> this is the right direction anyway.
> 
> So, the lock hierarchy is then...interesting:
> 
>  * if you hold sfp_mutex, you can FLOCKFILE a FILE iff _flags == 0
>  * if _flags != 0, you must lock sfp_mutex before zeroing it and
>FUNLOCKFILE and never touch the FILE again before unlocking
>sfp_mutex.
> 
> Given the assumption at top, I believe that's safe+correct.
> 
> The problem case for _fwalk() is _cleanup(), which currently and
> explicitly 'cheats' by failing to lock FILE...but I suspect that's
> a hold-over from when abort() called atexit() handlers, as it's
> supposed to be async-signal-safe and therefore can't take locks.
> abort() no longer does that: POSIX withdrew it because, well, it
> can't be done safely with an async-signal-safe abort() without
> making lots of stdio functions block all signals, which would lead
> to torches and pitchforks.

Ahhh, that makes sense.

> This is presumably _also_ why _fwalk() doesn't lock sfp_mutex when it
> 'obviously' should, so that's fixable too!  Woot!
> 
> So yeah, maybe it does work to:
> 
> 1) make __sfp() FLOCKFILE() the allocated FILE before unlocking sfp_mutex
> 
> 2) update f{,d,mem,un}open() and open_*memstream() to match (1), unlocking
>in all p

Re: an(4): tsleep(9) -> tsleep_nsec(9)

2020-11-30 Thread Scott Cheloha
On Thu, Nov 26, 2020 at 08:25:48PM +1100, Jonathan Gray wrote:
> On Tue, Nov 24, 2020 at 07:20:46PM -0600, Scott Cheloha wrote:
> > Hi,
> > 
> > Both kettenis@ and mpi@ have mentioned in private that my proposed
> > changes to tsleep_nsec(9) etc. would be nicer if we could just get rid
> > of tsleep(9) etc. entirely.
> > 
> > This is difficult, but I'll try.
> > 
> > Worst case, we thin out the remaining callers.  There are not many
> > left.
> > 
> > --
> > 
> > So, an(4) is one such caller.
> > 
> > In an_wait() we spin for (3 * hz) ticks waiting for CSR_WRITE_2 to
> > return the AN_EV_CMD flag.  There is no code handling a case where
> > this fails to happen.
> > 
> > What we do in practice is very nearly equivalent to spinning for 3
> > seconds waiting for CSR_WRITE_2 to return the AN_EV_CMD flag, so I
> > have converted it to use tsleep_nsec(9).
> > 
> > This compiles on amd64 but I can't test it.
> > 
> > Thoughts?  ok?
> 
> I don't see why the upper bound would have to be so precise.
> 
> Why not just
> 
> for (i = 0; i < 3000; i += 100) {
>   if (CSR_READ_2(sc, AN_EVENT_STAT) & AN_EV_CMD)
>   break;
>   tsleep_nsec(sc, PWAIT, "anatch", MSEC_TO_NSEC(100));
> }

I was just trying to imitate the current behavior as closely as
possible.

If you're fine fudging it like that then I'm fine with it.  Just
beware that that tsleep_nsec(9) can and will oversleep by up to 1/hz
seconds.

Did you intend to increase the poll interval from 10ms to 100ms or is
that a typo?



cat(1): simplify/flatten argument loops

2020-11-30 Thread Scott Cheloha
Hi,

The cook_args() and raw_args() functions in cat(1) are too clever.
They handle multiple special cases in a single big loop with lots of
branches.  It's been like this since at least 1989:

https://svnweb.freebsd.org/csrg/bin/cat/cat.c?view=markup&pathrev=37179

The goal seems to be to avoid calling cook_buf()/raw_cat() from
multiple places.  I think the result is convoluted.  If we isolated
the special cases and called cook_buf()/raw_cat() from multiple places
the result would be simpler and flatter.

You can break the cleanup in each function into four steps:

1. Pull the no-args case out of the loop and handle it first.  Now we
   don't need to check if (*argv == NULL) in the body of the loop.  One
   fewer assignment to fp/fd, too.

2. In the loop, isolate the (strcmp(*argv, "-") == 0) special case
   from the normal filename case.  Now we don't need to check whether
   we're working with stdin when we clean up at the end of a loop
   iteration.  Setup and cleanup are adjacent, no additional branches
   needed.

3. Pass the file name as an argument to cook_buf() and raw_cat().
   Now we don't need the global 'filename' variable.  Obviously
   this means we don't need to assign it a value, either.

4. Use a for-loop and move argv iteration into the loop header.
   Now we increment argv in a single place in the loop.

Thoughts?

Index: cat.c
===
RCS file: /cvs/src/bin/cat/cat.c,v
retrieving revision 1.27
diff -u -p -r1.27 cat.c
--- cat.c   28 Jun 2019 13:34:58 -  1.27
+++ cat.c   1 Dec 2020 00:24:20 -
@@ -51,12 +51,11 @@ extern char *__progname;
 
 int bflag, eflag, nflag, sflag, tflag, vflag;
 int rval;
-char *filename;
 
 void cook_args(char *argv[]);
-void cook_buf(FILE *);
+void cook_buf(FILE *, const char *);
 void raw_args(char *argv[]);
-void raw_cat(int);
+void raw_cat(int, const char *);
 
 int
 main(int argc, char *argv[])
@@ -110,30 +109,29 @@ cook_args(char **argv)
 {
FILE *fp;
 
-   fp = stdin;
-   filename = "stdin";
-   do {
-   if (*argv) {
-   if (!strcmp(*argv, "-"))
-   fp = stdin;
-   else if ((fp = fopen(*argv, "r")) == NULL) {
-   warn("%s", *argv);
-   rval = 1;
-   ++argv;
-   continue;
-   }
-   filename = *argv++;
-   }
-   cook_buf(fp);
-   if (fp == stdin)
+   if (*argv == NULL) {
+   cook_buf(stdin, "stdin");
+   return;
+   }
+
+   for (; *argv != NULL; argv++) {
+   if (!strcmp(*argv, "-")) {
+   cook_buf(stdin, "stdin");
-   clearerr(fp);
+   clearerr(stdin);
-   else
-   (void)fclose(fp);
-   } while (*argv);
+   continue;
+   }
+   if ((fp = fopen(*argv, "r")) == NULL) {
+   warn("%s", *argv);
+   rval = 1;
+   continue;
+   }
+   cook_buf(fp, *argv);
+   (void)fclose(fp);
+   }
 }
 
 void
-cook_buf(FILE *fp)
+cook_buf(FILE *fp, const char *filename)
 {
int ch, gobble, line, prev;
 
@@ -200,28 +198,28 @@ raw_args(char **argv)
 {
int fd;
 
-   fd = fileno(stdin);
-   filename = "stdin";
-   do {
-   if (*argv) {
-   if (!strcmp(*argv, "-"))
-   fd = fileno(stdin);
-   else if ((fd = open(*argv, O_RDONLY, 0)) == -1) {
-   warn("%s", *argv);
-   rval = 1;
-   ++argv;
-   continue;
-   }
-   filename = *argv++;
+   if (*argv == NULL) {
+   raw_cat(fileno(stdin), "stdin");
+   return;
+   }
+
+   for (; *argv != NULL; argv++) {
+   if (!strcmp(*argv, "-")) {
+   raw_cat(fileno(stdin), "stdin");
+   continue;
+   }
+   if ((fd = open(*argv, O_RDONLY, 0)) == -1) {
+   warn("%s", *argv);
+   rval = 1;
+   continue;
}
-   raw_cat(fd);
-   if (fd != fileno(stdin))
-   (void)close(fd);
-   } while (*argv);
+   raw_cat(fd, *argv);
+   (void)close(fd);
+   }
 }
 
 void
-raw_cat(int rfd)
+raw_cat(int rfd, const char *filename)
 {
int wfd;
ssize_t nr, nw, off;



stdio: fclose(3), fopen(3): intended locking hierarchy?

2020-11-25 Thread Scott Cheloha
Hey,

In stdio, which lock are you supposed to take first?  The global
sfp_mutex or the per-FILE lock?

In __sfp() we hold sfp_mutex while iterating through the pool (unsure
what else to call it) of FILEs.  No two threads can modify the pool at
the same time:

   111  _MUTEX_LOCK(&__sfp_mutex);
   112  for (g = &__sglue; g != NULL; g = g->next) {
   113  for (fp = g->iobs, n = g->niobs; --n >= 0; fp++)
   114  if (fp->_flags == 0)
   115  goto found;
   116  }
   117  
   118  /* release lock while mallocing */
   119  _MUTEX_UNLOCK(&__sfp_mutex);
   120  if ((g = moreglue(NDYNAMIC)) == NULL)
   121  return (NULL);
   122  _MUTEX_LOCK(&__sfp_mutex);
   123  lastglue->next = g;
   124  lastglue = g;
   125  fp = g->iobs;
   126  found:
   127  fp->_flags = 1; /* reserve this slot; caller sets real flags */
   128  _MUTEX_UNLOCK(&__sfp_mutex);

Note that we set _flags to 1 to reserve it for the current thread
before leaving sfp_mutex.  Note also that we don't take the per-FILE
lock before reading each FILE's _flags.

Then look at fclose(3):

39  int
40  fclose(FILE *fp)
41  {
42  int r;
43  
44  if (fp->_flags == 0) {  /* not open! */
45  errno = EBADF;
46  return (EOF);
47  }
48  FLOCKFILE(fp);
49  WCIO_FREE(fp);
50  r = fp->_flags & __SWR ? __sflush(fp) : 0;
51  if (fp->_close != NULL && (*fp->_close)(fp->_cookie) < 0)
52  r = EOF;
53  if (fp->_flags & __SMBF)
54  free((char *)fp->_bf._base);
55  if (HASUB(fp))
56  FREEUB(fp);
57  if (HASLB(fp))
58  FREELB(fp);
59  fp->_r = fp->_w = 0;/* Mess up if reaccessed. */
61  fp->_flags = 0; /* Release this FILE for reuse. */
63  FUNLOCKFILE(fp);
64  return (r);
65  }
66  DEF_STRONG(fclose);

We check if _flags is zero without any lock.  I'm unsure if this is
safe.

However, we then clean up under the FILE's lock and set _flags to zero
without sfp_mutex.

... that can't be right.

So, what to do?  My immediate thought was to export sfp_mutex and
enter it before writing _flags (diff attached).  But then the global
sfp_mutex is "higher" in the locking hierarchy than the per-FILE lock.
That doesn't seem quite right to me.

We also modify _flags all over stdio without sfp_mutex, so the rule is
inconsistent.

Another possibility is to take the per-FILE lock when examining each
FILE's _flags during __sfp().  That would be costlier, but then the
hierarchy would be reversed.

Thoughts?

Index: findfp.c
===
RCS file: /cvs/src/lib/libc/stdio/findfp.c,v
retrieving revision 1.19
diff -u -p -r1.19 findfp.c
--- findfp.c5 Apr 2016 04:29:21 -   1.19
+++ findfp.c26 Nov 2020 00:17:16 -
@@ -42,6 +42,7 @@
 #include "thread_private.h"
 
 int__sdidinit;
+void   *__sfp_mutex;
 
 #defineNDYNAMIC 10 /* add ten more whenever necessary */
 
@@ -56,7 +57,6 @@ static FILE usual[FOPEN_MAX - 3];
 static struct __sfileext usualext[FOPEN_MAX - 3];
 static struct glue uglue = { 0, FOPEN_MAX - 3, usual };
static struct glue *lastglue = &uglue;
-static void *sfp_mutex;
 
 static struct __sfileext __sFext[3];
 FILE __sF[3] = {
@@ -108,7 +108,7 @@ __sfp(void)
if (!__sdidinit)
__sinit();
 
-   _MUTEX_LOCK(&sfp_mutex);
+   _MUTEX_LOCK(&__sfp_mutex);
for (g = &__sglue; g != NULL; g = g->next) {
for (fp = g->iobs, n = g->niobs; --n >= 0; fp++)
if (fp->_flags == 0)
@@ -116,16 +116,16 @@ __sfp(void)
}
 
/* release lock while mallocing */
-   _MUTEX_UNLOCK(&sfp_mutex);
+   _MUTEX_UNLOCK(&__sfp_mutex);
if ((g = moreglue(NDYNAMIC)) == NULL)
return (NULL);
-   _MUTEX_LOCK(&sfp_mutex);
+   _MUTEX_LOCK(&__sfp_mutex);
lastglue->next = g;
lastglue = g;
fp = g->iobs;
 found:
fp->_flags = 1; /* reserve this slot; caller sets real flags */
-   _MUTEX_UNLOCK(&sfp_mutex);
+   _MUTEX_UNLOCK(&__sfp_mutex);
fp->_p = NULL;  /* no current pointer */
fp->_w = 0; /* nothing to read or write */
fp->_r = 0;
Index: fclose.c
===
RCS file: /cvs/src/lib/libc/stdio/fclose.c,v
retrieving revision 1.10
diff -u -p -r1.10 fclose.c
--- fclose.c31 Aug 2015 02:53:57 -  1.10
+++ fclose.c26 Nov 2020 00:17:16 -
@@ -57,7 +57,9 @@ fclose(FILE *fp)
if (HASLB(fp))
FREELB(fp);
fp->_r = fp->_w = 0;/* Mess up if 

an(4): tsleep(9) -> tsleep_nsec(9)

2020-11-24 Thread Scott Cheloha
Hi,

Both kettenis@ and mpi@ have mentioned in private that my proposed
changes to tsleep_nsec(9) etc. would be nicer if we could just get rid
of tsleep(9) etc. entirely.

This is difficult, but I'll try.

Worst case, we thin out the remaining callers.  There are not many
left.

--

So, an(4) is one such caller.

In an_wait() we spin for (3 * hz) ticks waiting for CSR_READ_2 to
return the AN_EV_CMD flag from the event status register.  There is no
code handling the case where this fails to happen.

What we do in practice is very nearly equivalent to spinning for 3
seconds waiting for CSR_READ_2 to return the AN_EV_CMD flag, so I
have converted the loop to use tsleep_nsec(9).

This compiles on amd64 but I can't test it.

Thoughts?  ok?

Index: an.c
===
RCS file: /cvs/src/sys/dev/ic/an.c,v
retrieving revision 1.76
diff -u -p -r1.76 an.c
--- an.c10 Jul 2020 13:26:37 -  1.76
+++ an.c25 Nov 2020 01:19:16 -
@@ -678,13 +678,18 @@ an_linkstat_intr(struct an_softc *sc)
 void
 an_wait(struct an_softc *sc)
 {
-   int i;
+   struct timespec now, end;
+
+   nanouptime(&now);
+   end = now;
+   end.tv_sec += 3;/* spin for at most three seconds */
 
CSR_WRITE_2(sc, AN_COMMAND, AN_CMD_NOOP2);
-   for (i = 0; i < 3*hz; i++) {
-   if (CSR_READ_2(sc, AN_EVENT_STAT) & AN_EV_CMD)
+   while ((CSR_READ_2(sc, AN_EVENT_STAT) & AN_EV_CMD) == 0) {
+   nanouptime();
+   if (timespeccmp(&end, &now, <=))
break;
-   (void)tsleep(sc, PWAIT, "anatch", 1);
+   tsleep_nsec(sc, PWAIT, "anatch", MSEC_TO_NSEC(10));
}
CSR_WRITE_2(sc, AN_EVENT_ACK, AN_EV_CMD);
 }



setitimer(2): protect per-process ITIMER_REAL state with ps_mtx

2020-10-27 Thread Scott Cheloha
Hi,

The last step before unlocking setitimer(2) and getitimer(2) is
protecting the per-process ITIMER_REAL state with something other
than the kernel lock.

Because the ITIMER_REAL timeout runs at IPL_SOFTCLOCK I think the
per-process mutex ps_mtx is appropriate.

Changing the setitimer() routine itself is trivial.  We just
enter/leave ps_mtx instead of the global itimer_mtx if the timer
in question is ITIMER_REAL.

The realitexpire() routine is trickier.  When setitimer(2) is unlocked
there will be a small race between the point where we leave
timeout_mutex in timeout_run() and where we enter ps_mtx in
realitexpire().  Between these two points a different thread in
setitimer(2) running without the kernel lock will be able to enter
ps_mtx and cancel or reschedule the timeout.  So the moment we enter
ps_mtx during realitexpire() we first need to check if the timer was
cancelled or rescheduled to run in the future.  In either case we
don't want to send SIGALRM to the process, we just want to leave the
mutex and return.

Sending the SIGALRM before/after we decide whether to reschedule the
timeout is inconsequential, so I've moved the prsignal() call to the
end of realitexpire().  I've added a KERNEL_ASSERT_LOCKED() there,
too, as prsignal() still needs the kernel lock.  This is trivial for
now, as all timeouts run with the kernel lock.

I have updated the locking notes in sys/proc.h to reflect these
changes.

ok?

Index: kern/kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.149
diff -u -p -r1.149 kern_time.c
--- kern/kern_time.c25 Oct 2020 01:55:18 -  1.149
+++ kern/kern_time.c27 Oct 2020 15:23:16 -
@@ -535,10 +535,11 @@ setitimer(int which, const struct itimer
TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
}
 
-   if (which != ITIMER_REAL)
-   mtx_enter(&itimer_mtx);
-   else
+   if (which == ITIMER_REAL) {
+   mtx_enter(&pr->ps_mtx);
nanouptime(&cts);
+   } else
+   mtx_enter(&itimer_mtx);
 
if (olditv != NULL)
oldits = *itimer;
@@ -553,7 +554,9 @@ setitimer(int which, const struct itimer
*itimer = its;
}
 
-   if (which != ITIMER_REAL)
+   if (which == ITIMER_REAL)
+   mtx_leave(&pr->ps_mtx);
+   else
mtx_leave(&itimer_mtx);
 
if (olditv != NULL) {
@@ -660,21 +663,43 @@ realitexpire(void *arg)
struct timespec cts;
struct process *pr = arg;
struct itimerspec *tp = &pr->ps_timer[ITIMER_REAL];
+   int need_signal = 0;
+
+   mtx_enter(&pr->ps_mtx);
 
-   prsignal(pr, SIGALRM);
+   /*
+* Do nothing if the timer was cancelled or rescheduled while we
+* were entering the mutex.
+*/
+   if (!timespecisset(&tp->it_value) || timeout_pending(&pr->ps_realit_to))
+   goto out;
 
-   /* If it was a one-shot timer we're done. */
+   /* The timer expired.  We need to send the signal. */
+   need_signal = 1;
+
+   /* One-shot timers are not reloaded. */
if (!timespecisset(&tp->it_interval)) {
timespecclear(&tp->it_value);
-   return;
+   goto out;
}
 
-   /* Find the nearest future expiration point and restart the timeout. */
+   /*
+* Find the nearest future expiration point and restart the
+* timer before sending the signal.
+*/
nanouptime(&cts);
while (timespeccmp(&tp->it_value, &cts, <=))
timespecadd(&tp->it_value, &tp->it_interval, &tp->it_value);
if ((pr->ps_flags & PS_EXITING) == 0)
timeout_at_ts(&pr->ps_realit_to, &tp->it_value);
+
+out:
+   mtx_leave(&pr->ps_mtx);
+
+   if (need_signal) {
+   KERNEL_ASSERT_LOCKED();
+   prsignal(pr, SIGALRM);
+   }
 }
 
 /*
Index: sys/proc.h
===
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.300
diff -u -p -r1.300 proc.h
--- sys/proc.h  16 Sep 2020 08:01:15 -  1.300
+++ sys/proc.h  27 Oct 2020 15:23:16 -
@@ -219,7 +219,7 @@ struct process {
struct  rusage *ps_ru;  /* sum of stats for dead threads. */
struct  tusage ps_tu;   /* accumulated times. */
struct  rusage ps_cru;  /* sum of stats for reaped children */
-   struct  itimerspec ps_timer[3]; /* [K] ITIMER_REAL timer */
+   struct  itimerspec ps_timer[3]; /* [m] ITIMER_REAL timer */
/* [T] ITIMER_{VIRTUAL,PROF} timers */
struct  timeout ps_rucheck_to;  /* [] resource limit check timer */
time_t  ps_nextxcpu;/* when to send next SIGXCPU, */
@@ -273,7 +273,7 @@ struct process {
int ps_refcnt;  /* Number of references. */
 
struct  timespec ps_start;  /* starting uptime. */
-   struct  timeout ps_realit_to; 

Re: Please test: switch select(2) to kqfilters

2020-10-26 Thread Scott Cheloha
On Mon, Oct 12, 2020 at 11:11:36AM +0200, Martin Pieuchot wrote:
> On 09/10/20(Fri) 10:38, Martin Pieuchot wrote:
> > On 02/10/20(Fri) 12:19, Martin Pieuchot wrote:
> > > Diff below modifies the internal implementation of {p,}select(2) to
> > > query kqfilter handlers instead of poll ones.
> > > 
> > > I deliberately left {p,}poll(2) untouched to ease the transition.
> > > 
> > > This diff includes some kqueue refactoring from visa@.  It is built on
> > > top of the changes that went in during the last release cycle notably
> > > EVFILT_EXCEPT and NOTE_OOB.
> > > 
> > > A mid-term goal of this change would be to get rid of the poll handlers
> > > in order to have a single event system in the kernel to maintain and
> > > turn mp-safe.
> > > 
> > > The logic is as follow:
> > > 
> > > - With this change every thread get a "private" kqueue, usable by the
> > >   kernel only, to register events for select(2) and later poll(2).
> > > 
> > > - Events specified via FD_SET(2) are converted to their kqueue equivalent.
> > > 
> > > - kqueue_scan() has been modified to be restartable and work with a given
> > >   kqueue.
> > > 
> > > - At the end of every {p,}select(2) syscall the private kqueue is purged.
> > > 
> > > This version includes a fix for a previously reported regression triggered
> > > by regress/usr.bin/ssh's keyscan test.
> > > 
> > > 
> > > I'd like to get this in early in this release cycle, so please test and
> > > report back :o)
> > 
> > Thanks for all the reports.  Here's an updated version including the
> > following changes:
> > 
> > - Allocate the per-thread kqueue in the first {p,}select(2) syscall to
> >   not waste resources as suggested by anton@
> > 
> > - Keep EWOULDBLOCK handling inside kqueue_scan(), pointed by cheloha@
> > 
> > - Add a comment to better explain why successive kqueue_scan() calls are
> >   always non-blocking
> > 
> > I'm appreciate reviews/oks on the kqueue_scan() refactoring I sent to
> > start shrinking this diff.
> > 
> > Tests are always welcome, especially on non-amd64 architectures.
> 
> Rebased diff on top of -current below:

Misc. nits below.  There is a question and a bug down there, too.

> Index: kern/kern_event.c
> ===
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.143
> diff -u -p -r1.143 kern_event.c
> --- kern/kern_event.c 11 Oct 2020 07:11:59 -  1.143
> +++ kern/kern_event.c 12 Oct 2020 08:56:21 -
> @@ -57,6 +57,7 @@
>  #include 
>  #include 
>  
> +struct   kqueue *kqueue_alloc(struct filedesc *);
>  void kqueue_terminate(struct proc *p, struct kqueue *);
>  void kqueue_free(struct kqueue *);
>  void kqueue_init(void);
> @@ -504,6 +505,27 @@ const struct filterops dead_filtops = {
>   .f_event= filt_dead,
>  };
>  
> +void
> +kqpoll_init(struct proc *p)
> +{
> + if (p->p_kq != NULL)
> + return;
> +
> + p->p_kq = kqueue_alloc(p->p_fd);
> + p->p_kq_serial = arc4random();
> +}
> +
> +void
> +kqpoll_exit(struct proc *p)
> +{
> + if (p->p_kq == NULL)
> + return;
> +
> + kqueue_terminate(p, p->p_kq);
> + kqueue_free(p->p_kq);
> + p->p_kq = NULL;
> +}
> +
>  struct kqueue *
>  kqueue_alloc(struct filedesc *fdp)
>  {
> @@ -567,6 +589,7 @@ sys_kevent(struct proc *p, void *v, regi
>   struct timespec ts;
>   struct timespec *tsp = NULL;
>   int i, n, nerrors, error;
> + int ready, total;

Any reason not to put these on the existing line with the other ints?

>   struct kevent kev[KQ_NEVENTS];
>  
>   if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
> @@ -595,9 +618,9 @@ sys_kevent(struct proc *p, void *v, regi
>   kq = fp->f_data;
>   nerrors = 0;
>  
> - while (SCARG(uap, nchanges) > 0) {
> - n = SCARG(uap, nchanges) > KQ_NEVENTS ?
> - KQ_NEVENTS : SCARG(uap, nchanges);
> + while ((n = SCARG(uap, nchanges)) > 0) {
> + if (n > nitems(kev))
> + n = nitems(kev);
>   error = copyin(SCARG(uap, changelist), kev,
>   n * sizeof(struct kevent));
>   if (error)
> @@ -635,11 +658,36 @@ sys_kevent(struct proc *p, void *v, regi
>  
>   kqueue_scan_setup(&scan, kq);
>   FRELE(fp, p);
> - error = kqueue_scan(&scan, SCARG(uap, nevents), SCARG(uap, eventlist),
> - tsp, kev, p, &n);

Add a newline here to isolate the leading comment for the loop.

> + /*
> +  * Collect as many events as we can.  The timeout on successive
> +  * loops is disabled (kqueue_scan() becomes non-blocking).
> +  */
> + total = 0;
> + error = 0;
> + while ((n = SCARG(uap, nevents) - total) > 0) {
> + if (n > nitems(kev))
> + n = nitems(kev);
> + ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
> + if (ready == 0)
> + break;
> + error = copyout(kev, SCARG(uap, eventlist) + total,
> +   

calctsru(): simplify code

2020-10-25 Thread Scott Cheloha
calctsru() can be simplified.  Its length makes it look more
complicated than it really is.  We're converting from ticks to
nanoseconds and storing nanoseconds in a timespec:

- Remove the check for zero.  Pointless.

- Convert from ticks to nanoseconds inline.  No intermediate
  variables.

- Use NSEC_TO_TIMESPEC() to abbreviate the nsec -> timespec conversion.
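
For example, with stathz = 100, tu_sticks = 250 works out to
250 * 1000000000 / 100 = 2500000000 nsec, which NSEC_TO_TIMESPEC()
splits into tv_sec = 2 and tv_nsec = 500000000.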

ok?

Index: kern_resource.c
===
RCS file: /cvs/src/sys/kern/kern_resource.c,v
retrieving revision 1.69
diff -u -p -r1.69 kern_resource.c
--- kern_resource.c 25 Sep 2020 20:24:32 -  1.69
+++ kern_resource.c 25 Oct 2020 17:05:57 -
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -408,34 +409,12 @@ void
 calctsru(struct tusage *tup, struct timespec *up, struct timespec *sp,
 struct timespec *ip)
 {
-   u_quad_t st, ut, it;
-   int freq;
+   int freq = stathz ? stathz : hz;
 
-   st = tup->tu_sticks;
-   ut = tup->tu_uticks;
-   it = tup->tu_iticks;
-
-   if (st + ut + it == 0) {
-   timespecclear(up);
-   timespecclear(sp);
-   if (ip != NULL)
-   timespecclear(ip);
-   return;
-   }
-
-   freq = stathz ? stathz : hz;
-
-   st = st * 1000000000 / freq;
-   sp->tv_sec = st / 1000000000;
-   sp->tv_nsec = st % 1000000000;
-   ut = ut * 1000000000 / freq;
-   up->tv_sec = ut / 1000000000;
-   up->tv_nsec = ut % 1000000000;
-   if (ip != NULL) {
-   it = it * 1000000000 / freq;
-   ip->tv_sec = it / 1000000000;
-   ip->tv_nsec = it % 1000000000;
-   }
+   NSEC_TO_TIMESPEC(tup->tu_uticks * 1000000000 / freq, up);
+   NSEC_TO_TIMESPEC(tup->tu_sticks * 1000000000 / freq, sp);
+   if (ip != NULL)
+   NSEC_TO_TIMESPEC(tup->tu_iticks * 1000000000 / freq, ip);
 }
 
 void



sys/kernel.h: delete dead externs

2020-10-15 Thread Scott Cheloha
Several of the externs in sys/kernel.h are for variables that don't
exist.  I can't find global declarations for tickfix, tickfixinterval,
tickdelta, or timedelta.

ok to delete these?

Index: sys/kernel.h
===
RCS file: /cvs/src/sys/sys/kernel.h,v
retrieving revision 1.23
diff -u -p -r1.23 kernel.h
--- sys/kernel.h20 May 2020 17:24:17 -  1.23
+++ sys/kernel.h15 Oct 2020 08:09:11 -
@@ -51,13 +51,9 @@ extern int utc_offset;   /* seconds east 
 
 extern int tick;   /* usec per tick (1000000 / hz) */
 extern int tick_nsec;  /* nsec per tick */
-extern int tickfix;/* periodic tick adj. tick not integral */
-extern int tickfixinterval;/* interval at which to apply adjustment */
 extern int tickadj;/* "standard" clock skew, us./tick */
 extern int ticks;  /* # of hardclock ticks */
 extern int hz; /* system clock's frequency */
 extern int stathz; /* statistics clock's frequency */
 extern int profhz; /* profiling clock's frequency */
 extern int lbolt;  /* once a second sleep address */
-extern int tickdelta;
-extern long timedelta;



Re: _exit(2), execve(2): cancel interval timers MP-safely

2020-10-14 Thread Scott Cheloha
On Wed, Oct 14, 2020 at 08:06:52PM -0500, Scott Cheloha wrote:
> _exit(2) and execve(2) need to obey the locking protocol described in
> proc.h when manipulating the per-process interval timer state.
> 
> While we're here we can also remove the now pointless splclock/splx
> dance from execve(2).
> 
> The easiest way to obey the locking protocol is to reuse the interface
> the syscalls are using: setitimer() in kern_time.c.
> 
> Given that we only want to cancel the timers I wrote a small helper
> function, cancelitimer().  I think it's tidier than putting the
> prototype for setitimer() into sys/time.h and requiring the caller to
> prepare an itimerval struct before calling.
> 
> Compare:
> 
>   struct itimerval itv;
>   timerclear(&itv.it_value);
>   timerclear(&itv.it_interval);
>   setitimer(ITIMER_REAL, &itv, NULL);
> 
> with:
> 
>   cancelitimer(ITIMER_REAL);
> 
> ... should I shove the for-loop into the helper function too?  Maybe
> call it "cancel_all_itimers()"?  I have a vague feeling that showing
> the reader that there are multiple timers is a good thing here, but
> then again maybe I'm wrong and nobody cares.
> 
> Preferences?  ok?

Whoops, forgot the kern_time.c part of the diff.

Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.188
diff -u -p -r1.188 kern_exit.c
--- kern/kern_exit.c18 Mar 2020 15:48:21 -  1.188
+++ kern/kern_exit.c15 Oct 2020 01:12:50 -
@@ -194,7 +194,11 @@ exit1(struct proc *p, int xexit, int xsi
/* close open files and release open-file table */
fdfree(p);
 
-   timeout_del(&pr->ps_realit_to);
+   /* cancel all interval timers */
+   int i;
+   for (i = 0; i < nitems(pr->ps_timer); i++)
+   cancelitimer(i);
+
timeout_del(&pr->ps_rucheck_to);
 #ifdef SYSVSEM
semexit(pr);
Index: kern/kern_exec.c
===
RCS file: /cvs/src/sys/kern/kern_exec.c,v
retrieving revision 1.217
diff -u -p -r1.217 kern_exec.c
--- kern/kern_exec.c11 Jul 2020 22:59:05 -  1.217
+++ kern/kern_exec.c15 Oct 2020 01:12:50 -
@@ -656,14 +656,9 @@ sys_execve(struct proc *p, void *v, regi
}
 
if (pr->ps_flags & PS_SUGIDEXEC) {
-   int i, s = splclock();
-
-   timeout_del(&pr->ps_realit_to);
-   for (i = 0; i < nitems(pr->ps_timer); i++) {
-   timespecclear(&pr->ps_timer[i].it_interval);
-   timespecclear(&pr->ps_timer[i].it_value);
-   }
-   splx(s);
+   int i;
+   for (i = 0; i < nitems(pr->ps_timer); i++)
+   cancelitimer(i);
}
 
/* reset CPU time usage for the thread, but not the process */
Index: kern/kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.146
diff -u -p -r1.146 kern_time.c
--- kern/kern_time.c13 Oct 2020 17:33:39 -  1.146
+++ kern/kern_time.c15 Oct 2020 01:12:50 -
@@ -572,6 +572,16 @@ setitimer(int which, const struct itimer
}
 }
 
+void
+cancelitimer(int which)
+{
+   struct itimerval itv;
+
+   timerclear(&itv.it_value);
+   timerclear(&itv.it_interval);
+   setitimer(which, &itv, NULL);
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
Index: sys/time.h
===
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.55
diff -u -p -r1.55 time.h
--- sys/time.h  6 Jul 2020 13:33:09 -   1.55
+++ sys/time.h  15 Oct 2020 01:12:50 -
@@ -307,6 +307,7 @@ time_t  getuptime(void);
 struct proc;
 intclock_gettime(struct proc *, clockid_t, struct timespec *);
 
+void   cancelitimer(int);
 intitimerfix(struct timeval *);
 intitimerdecr(struct itimerspec *, long);
 intsettime(const struct timespec *);



_exit(2), execve(2): cancel interval timers MP-safely

2020-10-14 Thread Scott Cheloha
_exit(2) and execve(2) need to obey the locking protocol described in
proc.h when manipulating the per-process interval timer state.

While we're here we can also remove the now pointless splclock/splx
dance from execve(2).

The easiest way to obey the locking protocol is to reuse the interface
the syscalls are using: setitimer() in kern_time.c.

Given that we only want to cancel the timers I wrote a small helper
function, cancelitimer().  I think it's tidier than putting the
prototype for setitimer() into sys/time.h and requiring the caller to
prepare an itimerval struct before calling.

Compare:

struct itimerval itv;
timerclear(&itv.it_value);
timerclear(&itv.it_interval);
setitimer(ITIMER_REAL, &itv, NULL);

with:

cancelitimer(ITIMER_REAL);

... should I shove the for-loop into the helper function too?  Maybe
call it "cancel_all_itimers()"?  I have a vague feeling that showing
the reader that there are multiple timers is a good thing here, but
then again maybe I'm wrong and nobody cares.
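
If so, a sketch of the helper on top of the attached diff would be:

	void
	cancel_all_itimers(void)
	{
		int i;

		for (i = 0; i < nitems(curproc->p_p->ps_timer); i++)
			cancelitimer(i);
	}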

Preferences?  ok?

Index: kern/kern_exec.c
===
RCS file: /cvs/src/sys/kern/kern_exec.c,v
retrieving revision 1.217
diff -u -p -r1.217 kern_exec.c
--- kern/kern_exec.c11 Jul 2020 22:59:05 -  1.217
+++ kern/kern_exec.c15 Oct 2020 01:02:45 -
@@ -656,14 +656,9 @@ sys_execve(struct proc *p, void *v, regi
}
 
if (pr->ps_flags & PS_SUGIDEXEC) {
-   int i, s = splclock();
-
-   timeout_del(&pr->ps_realit_to);
-   for (i = 0; i < nitems(pr->ps_timer); i++) {
-   timespecclear(&pr->ps_timer[i].it_interval);
-   timespecclear(&pr->ps_timer[i].it_value);
-   }
-   splx(s);
+   int i;
+   for (i = 0; i < nitems(pr->ps_timer); i++)
+   cancelitimer(i);
}
 
/* reset CPU time usage for the thread, but not the process */
Index: kern/kern_exit.c
===
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.188
diff -u -p -r1.188 kern_exit.c
--- kern/kern_exit.c18 Mar 2020 15:48:21 -  1.188
+++ kern/kern_exit.c15 Oct 2020 01:02:45 -
@@ -194,7 +194,11 @@ exit1(struct proc *p, int xexit, int xsi
/* close open files and release open-file table */
fdfree(p);
 
-   timeout_del(&pr->ps_realit_to);
+   /* cancel all interval timers */
+   int i;
+   for (i = 0; i < nitems(pr->ps_timer); i++)
+   cancelitimer(i);
+
timeout_del(&pr->ps_rucheck_to);
 #ifdef SYSVSEM
semexit(pr);
Index: sys/time.h
===
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.55
diff -u -p -r1.55 time.h
--- sys/time.h  6 Jul 2020 13:33:09 -   1.55
+++ sys/time.h  15 Oct 2020 01:02:45 -
@@ -307,6 +307,7 @@ time_t  getuptime(void);
 struct proc;
 intclock_gettime(struct proc *, clockid_t, struct timespec *);
 
+void   cancelitimer(int);
 intitimerfix(struct timeval *);
 intitimerdecr(struct itimerspec *, long);
 intsettime(const struct timespec *);



Re: timeout(9): add clock-based timeouts (attempt 2)

2020-10-09 Thread Scott Cheloha
Hey,

> On Oct 7, 2020, at 8:49 PM, 内藤 祐一郎  wrote:
> 
> Hi.
> 
> I'm looking forward to this patch is committed.
> Because this patch solves my problem about CARP timeout.
> 
> IIJ, a company that I am working for, is using carp(4) on VMware ESXi hosts
> for VPN and web gateway services.
> 
> One is the carp(4) master and the other is the backup.
> The active host sometimes fails over to the backup when the ESXi host
> sees high CPU usage.  The OpenBSD machine's CPU ready time also looks
> high on average in the ESXi monitor.
> 
> A machine with high CPU ready time delays sending its carp
> advertisements by 3 or 4 seconds.  That is enough to fail over to the
> backup.
> 
> In my investigation, the OpenBSD machine does not always get the CPU
> under high CPU ready conditions, even when the interrupt handler needs
> it.  Each delayed hardclock() call delays the tick count a little.
> Each individual delay is small but is never made up, so the total
> delay can reach 3 or 4 seconds while the tick count reaches 100.
> The tickless patch can solve the delay.
> 
> I have tried to adapt ip_carp.c to the tickless attempt 2.
> The delay of the carp advertisements dropped to about 2 seconds.

I'm glad to hear it improves things.  Thanks for testing it out.

>> 2020/09/09 4:00、Mark Kettenis のメール:
>> The diff looks reasonable to me, but I'd like to discuss the path
>> forward with some people during the hackathon next week.
> 
> Is there any discussion in the hackathon?

Not that I heard.  I wasn't at the hackathon, though.

--

If I get an OK from someone I will commit what I have so far.

Where do we stand?

- The nitty gritty details in this commit -- the hashing,
  the loops, and the basic algorithm -- haven't changed
  in almost a year.  I'm confident they work.

- The commit itself doesn't change any behavior because no
  existing timeouts are converted to use timeout_set_kclock().
  So we shouldn't see any regressions like last time until
  someone deliberately changes an existing timeout to use the
  kclock interfaces.

The thing that needs to be decided is how to go about dragging
the rest of the tree into using the kclock timeout interfaces.

- Should we keep a tick-based timeout interface?  If so,
  for how long?  Linux kept theirs as a distinct interface.
  FreeBSD discarded theirs.

- Should we quietly reimplement timeout_add_sec(9), etc.,
  in terms of kclock timeouts or should we do a full-tree
  API change to explicitly use timeout_in_nsec()?

I don't think we can make such decisions without putting kclock
timeouts into the tree so people can use them.

So, are you OK with this as-is?

Anybody else?



Re: setitimer(2): ITIMER_REAL: simplify realitexpire()

2020-10-07 Thread Scott Cheloha
> On Oct 7, 2020, at 15:51, Klemens Nanni  wrote:
> 
> On Wed, Oct 07, 2020 at 03:34:36PM -0500, Scott Cheloha wrote:
>> kn@ wanted to clean it up a while ago but I wasn't far enough along
>> with hi-res timeouts to change the code yet.
> That was not me.

I am thinking of this:

https://marc.info/?l=openbsd-tech=156367664207978=2



setitimer(2): ITIMER_REAL: simplify realitexpire()

2020-10-07 Thread Scott Cheloha
Hi,

The code in realitexpire(), the ITIMER_REAL timeout callback, is
needlessly complicated.

kn@ wanted to clean it up a while ago but I wasn't far enough along
with hi-res timeouts to change the code yet.

Hi-res timeouts are now imminent, and setitimer(2) will probably be
the first guinea pig I use to demonstrate that they Actually Work.
This cleanup will make employing a hi-res timeout here in a later diff
a lot simpler.

So, the cleanup:

- No need to call getnanouptime(9) more than once.  timespecadd(3) is
  very fast, and it_interval is at least 1/hz seconds wide, so we expect
  to "catch up" to the uptime after a couple iterations at most.

  When we switch to a hi-res timeout this will be important.  We will
  need to switch to nanouptime(9), which is much more expensive than
  getnanouptime(9).

- The for-loop and indentation here is really peculiar.  Use a while-loop
  to increment it_value and pull the rest of the code out of the loop.

- Collapse intermediate assignments.  tstohz(9) cannot return 0 in
  this case as it_interval is non-zero, so the check for timo < 0 is
  pointless.  With that out of the way, all we want to do is round
  back up to 1 if (tstohz(9) - 1 == 0), which we can do with MAX().

I am leaving the PS_EXITING check in place.  I am *pretty* sure it is
superfluous: realitexpire() runs under the kernel lock, and _exit(2)
runs under the kernel lock, so we aren't racing _exit(2)... but I will
leave it in-place until I can confirm my suspicions with the right
people.

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.144
diff -u -p -r1.144 kern_time.c
--- kern_time.c 7 Oct 2020 17:53:44 -   1.144
+++ kern_time.c 7 Oct 2020 20:19:15 -
@@ -644,31 +644,25 @@ sys_setitimer(struct proc *p, void *v, r
 void
 realitexpire(void *arg)
 {
+   struct timespec cts, nts;
struct process *pr = arg;
struct itimerspec *tp = &pr->ps_timer[ITIMER_REAL];
 
prsignal(pr, SIGALRM);
+
+   /* If it was a one-shot timer we're done. */
if (!timespecisset(&tp->it_interval)) {
timespecclear(&tp->it_value);
return;
}
-   for (;;) {
-   struct timespec cts, nts;
-   int timo;
 
+   /* Find the nearest future expiration point and reload the timer. */
+   getnanouptime(&cts);
+   while (timespeccmp(&tp->it_value, &cts, <=))
timespecadd(&tp->it_value, &tp->it_interval, &tp->it_value);
-   getnanouptime(&cts);
-   if (timespeccmp(&tp->it_value, &cts, >)) {
-   nts = tp->it_value;
-   timespecsub(&nts, &cts, &nts);
-   timo = tstohz(&nts) - 1;
-   if (timo <= 0)
-   timo = 1;
-   if ((pr->ps_flags & PS_EXITING) == 0)
-   timeout_add(&pr->ps_realit_to, timo);
-   return;
-   }
-   }
+   timespecsub(&tp->it_value, &cts, &nts);
+   if ((pr->ps_flags & PS_EXITING) == 0)
+   timeout_add(&pr->ps_realit_to, MAX(1, tstohz(&nts) - 1));
 }
 
 /*



Re: Please test: switch select(2) to kqfilters

2020-10-04 Thread Scott Cheloha
On Sat, Oct 03, 2020 at 09:09:00AM +0200, Martin Pieuchot wrote:
> On 02/10/20(Fri) 19:09, Scott Cheloha wrote:
> > On Fri, Oct 02, 2020 at 12:19:35PM +0200, Martin Pieuchot wrote:
> > > @@ -635,12 +642,39 @@ sys_kevent(struct proc *p, void *v, regi
> > >   goto done;
> > >   }
> > >  
> > > +
> > >   KQREF(kq);
> > >   FRELE(fp, p);
> > > - error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
> > > - tsp, kev, p, &n);
> > > + /*
> > > +  * Collect as many events as we can.  The timeout on successive
> > > +  * loops is disabled (kqueue_scan() becomes non-blocking).
> > > +  */
> > > + total = 0;
> > > + error = 0;
> > > + kqueue_scan_setup(&scan, kq);
> > > + while ((n = SCARG(uap, nevents) - total) > 0) {
> > > + if (n > nitems(kev))
> > > + n = nitems(kev);
> > > + ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
> > > + if (ready == 0)
> > > + break;
> > > + error = copyout(kev, SCARG(uap, eventlist) + total,
> > > + sizeof(struct kevent) * ready);
> > > +#ifdef KTRACE
> > > + if (KTRPOINT(p, KTR_STRUCT))
> > > + ktrevent(p, kev, ready);
> > > +#endif
> > > + total += ready;
> > > + if (error || ready < n)
> > > + break;
> > > + tsp = &ts;  /* successive loops non-blocking */
> > > + timespecclear(tsp);
> > 
> > Here, this.  Why do we force a non-blocking loop the second time?
> 
> If there's a second time that implies the first time already reported
> some events so there's already something to return to userland.  In that
> case we just want to gather the events that were not collected the first
> time and not sleep again.

Okay, now I see it, thank you for the explanation.

> > > + }
> > > + kqueue_scan_finish(&scan);
> > >   KQRELE(kq);
> > > - *retval = n;
> > > + if (error == EWOULDBLOCK)
> > > + error = 0;
> > > + *retval = total;
> > >   return (error);
> > >  
> > >   done:
> > > @@ -894,24 +928,22 @@ kqueue_sleep(struct kqueue *kq, struct t
> > >   return (error);
> > >  }
> > >  
> > > +/*
> > > + * Scan the kqueue, blocking if necessary until the target time is reached.
> > > + * If tsp is NULL we block indefinitely.  If tsp->ts_secs/nsecs are both
> > > + * 0 we do not block at all.
> > > + */
> > >  int
> > > -kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
> > > -struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
> > > +kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
> > > +struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
> > >  {
> > > - struct kevent *kevp;
> > > - struct knote mend, mstart, *kn;
> > > - int s, count, nkev, error = 0;
> > > -
> > > - nkev = 0;
> > > - kevp = kev;
> > > + struct knote *kn;
> > > + struct kqueue *kq = scan->kqs_kq;
> > > + int s, count, nkev = 0, error = 0;
> > >  
> > >   count = maxevents;
> > >   if (count == 0)
> > >   goto done;
> > > -
> > > - memset(&mstart, 0, sizeof(mstart));
> > > - memset(&mend, 0, sizeof(mend));
> > > -
> > >  retry:
> > >   KASSERT(count == maxevents);
> > >   KASSERT(nkev == 0);
> > > @@ -923,7 +955,8 @@ retry:
> > >  
> > >   s = splhigh();
> > >   if (kq->kq_count == 0) {
> > > - if (tsp != NULL && !timespecisset(tsp)) {
> > > + if ((tsp != NULL && !timespecisset(tsp)) ||
> > > + scan->kqs_nevent != 0) {
> > >   splx(s);
> > >   error = 0;
> > >   goto done;
> > > @@ -931,7 +964,7 @@ retry:
> > >   kq->kq_state |= KQ_SLEEP;
> > >   error = kqueue_sleep(kq, tsp);
> > >   splx(s);
> > > - if (error == 0 || error == EWOULDBLOCK)
> > > + if (error == 0)
> > >   goto retry;
> > 
> > Why wouldn't we want to retry in the EWOULDBLOCK case?
> > You have a check for
> > 
> > tsp != NULL && !timespecisset(tsp)
> > 
> > e.g., when you time out

Re: Please test: switch select(2) to kqfilters

2020-10-02 Thread Scott Cheloha
On Fri, Oct 02, 2020 at 12:19:35PM +0200, Martin Pieuchot wrote:
> 
> [...]
> 
> I'd like to get this in early in this release cycle, so please test and
> report back :o)

You removed the resleep logic that accounts for if/when tsleep_nsec(9)
returns early.  So now select and pselect can return too soon.

I've left questions below in the spots I think look off.
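
For reference, the removed resleep logic looked roughly like this (a
sketch from memory, not the exact code): compute an absolute deadline
up front and go back to sleep for the remainder whenever the sleep
returns early:

	getnanouptime(&start);
	timespecadd(&start, &timeout, &deadline);
	remaining = timeout;
	for (;;) {
		error = tsleep_nsec(ident, prio, "select",
		    TIMESPEC_TO_NSEC(&remaining));
		if (error != EWOULDBLOCK)
			break;		/* woken up or interrupted */
		getnanouptime(&now);
		if (timespeccmp(&deadline, &now, <=))
			break;		/* the timeout really expired */
		timespecsub(&deadline, &now, &remaining);
	}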

> Index: kern/kern_event.c
> ===
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.142
> diff -u -p -r1.142 kern_event.c
> --- kern/kern_event.c 12 Aug 2020 13:49:24 -  1.142
> +++ kern/kern_event.c 1 Oct 2020 12:53:54 -
> @@ -64,9 +64,6 @@ voidKQREF(struct kqueue *);
>  void KQRELE(struct kqueue *);
>  
>  int  kqueue_sleep(struct kqueue *, struct timespec *);
> -int  kqueue_scan(struct kqueue *kq, int maxevents,
> - struct kevent *ulistp, struct timespec *timeout,
> - struct kevent *kev, struct proc *p, int *retval);
>  
>  int  kqueue_read(struct file *, struct uio *, int);
>  int  kqueue_write(struct file *, struct uio *, int);
> @@ -521,6 +518,14 @@ kqueue_alloc(struct filedesc *fdp)
>   return (kq);
>  }
>  
> +void
> +kqueue_exit(struct proc *p)
> +{
> + kqueue_terminate(p, p->p_kq);
> + kqueue_free(p->p_kq);
> + p->p_kq = NULL;
> +}
> +
>  int
>  sys_kqueue(struct proc *p, void *v, register_t *retval)
>  {
> @@ -554,6 +559,7 @@ out:
>  int
>  sys_kevent(struct proc *p, void *v, register_t *retval)
>  {
> + struct kqueue_scan_state scan;
>   struct filedesc* fdp = p->p_fd;
>   struct sys_kevent_args /* {
>   syscallarg(int) fd;
> @@ -569,6 +575,7 @@ sys_kevent(struct proc *p, void *v, regi
>   struct timespec ts;
>   struct timespec *tsp = NULL;
>   int i, n, nerrors, error;
> + int ready, total;
>   struct kevent kev[KQ_NEVENTS];
>  
>   if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
> @@ -597,9 +604,9 @@ sys_kevent(struct proc *p, void *v, regi
>   kq = fp->f_data;
>   nerrors = 0;
>  
> - while (SCARG(uap, nchanges) > 0) {
> - n = SCARG(uap, nchanges) > KQ_NEVENTS ?
> - KQ_NEVENTS : SCARG(uap, nchanges);
> + while ((n = SCARG(uap, nchanges)) > 0) {
> + if (n > nitems(kev))
> + n = nitems(kev);
>   error = copyin(SCARG(uap, changelist), kev,
>   n * sizeof(struct kevent));
>   if (error)
> @@ -635,12 +642,39 @@ sys_kevent(struct proc *p, void *v, regi
>   goto done;
>   }
>  
> +
>   KQREF(kq);
>   FRELE(fp, p);
> - error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
> - tsp, kev, p, &n);
> + /*
> +  * Collect as many events as we can.  The timeout on successive
> +  * loops is disabled (kqueue_scan() becomes non-blocking).
> +  */
> + total = 0;
> + error = 0;
> + kqueue_scan_setup(&scan, kq);
> + while ((n = SCARG(uap, nevents) - total) > 0) {
> + if (n > nitems(kev))
> + n = nitems(kev);
> + ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
> + if (ready == 0)
> + break;
> + error = copyout(kev, SCARG(uap, eventlist) + total,
> + sizeof(struct kevent) * ready);
> +#ifdef KTRACE
> + if (KTRPOINT(p, KTR_STRUCT))
> + ktrevent(p, kev, ready);
> +#endif
> + total += ready;
> + if (error || ready < n)
> + break;
> + tsp = &ts;  /* successive loops non-blocking */
> + timespecclear(tsp);

Here, this.  Why do we force a non-blocking loop the second time?

> + }
> + kqueue_scan_finish(&scan);
>   KQRELE(kq);
> - *retval = n;
> + if (error == EWOULDBLOCK)
> + error = 0;
> + *retval = total;
>   return (error);
>  
>   done:
> @@ -894,24 +928,22 @@ kqueue_sleep(struct kqueue *kq, struct t
>   return (error);
>  }
>  
> +/*
> + * Scan the kqueue, blocking if necessary until the target time is reached.
> + * If tsp is NULL we block indefinitely.  If tsp->ts_secs/nsecs are both
> + * 0 we do not block at all.
> + */
>  int
> -kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
> -struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
> +kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
> +struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
>  {
> - struct kevent *kevp;
> - struct knote mend, mstart, *kn;
> - int s, count, nkev, error = 0;
> -
> - nkev = 0;
> - kevp = kev;
> + struct knote *kn;
> + struct kqueue *kq = scan->kqs_kq;
> + int s, count, nkev = 0, error = 0;
>  
>   count = maxevents;
>   if (count == 0)
>   goto done;
> -
> - memset(&mstart, 0, sizeof(mstart));
> - memset(&mend, 0, 

Re: getitimer(2), setitimer(2): merge critical sections

2020-09-25 Thread Scott Cheloha
On Tue, Sep 01, 2020 at 10:24:11AM -0500, Scott Cheloha wrote:
> On Mon, Aug 17, 2020 at 05:55:34PM -0500, Scott Cheloha wrote:
> > 
> > [...]
> 
> Two week bump.
> 
> In summary:
> 
> - Merge the critical sections so that "timer swap" with setitimer(2)
>   is atomic.
> 
> - To do this, move error-free operations into a common kernel
>   subroutine, setitimer().  Now we have one critical section.
> 
> - Leave error-prone operations in sys_getitimer() and sys_setitimer().
> 
> In order to make the "timer swap" atomic we leave the timer installed
> if the copyout(9) fails.  This isn't great, but it is unavoidable
> without permitting copyout(9) from within a mutex.  FreeBSD and Linux
> went this direction, too.  I would rather leave the timer running and
> have an atomic swap than race the hardclock(9) or the realitexpire()
> timeout.
> 
> [...]

6 week bump.  Diff attached again.  Basically nothing has changed.
We're moving the critical sections from sys_getitimer() and
sys_setitimer() into a new subroutine, setitimer(), and merging those
critical sections.

I'm confident that merging the critical sections is the right
direction to take setitimer(2).  It gets us closer to removing the
kernel lock and eliminates a race with hardclock(9).

In the near future we will need to call the setitimer() subroutine
during _exit(2) to safely cancel the ITIMER_REAL timeout so I think
putting the critical section into a subroutine we can call from
outside of kern_time.c is also the right thing to do.

I'm going to wait until after the tree is unlocked to commit this, as
the behavior change in the timer swap case *might* break some software
out there that isn't checking the setitimer(2) return code.  It's
unlikely, but it's possible, so I'll wait.

CC: everyone I've ever talked to about setitimer(2).

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 25 Sep 2020 23:32:08 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,62 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, const struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts, &its.it_value);
+   } else
+   timeout_del(&pr->ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL && timespecisset(&oldits.it_value)) {
+   struct timespec now;
+   getnanouptime(&now);
+   if (timespeccmp(&oldits.it_value, &now, <))
+   timespecclear(&oldits.it_value);
+   else {
+   timespecsub(&oldits.it_value, &now,
+   &oldits.it_value);
+   }
+   }
+   TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+   TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+   }
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +572,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = 

setpriority(2): booleans are not scalars

2020-09-25 Thread Scott Cheloha
`found' serves as a boolean here.  I'd prefer to simply set it to
1 instead of incrementing it when we find what we're looking for.

ok?

Index: kern_resource.c
===
RCS file: /cvs/src/sys/kern/kern_resource.c,v
retrieving revision 1.68
diff -u -p -r1.68 kern_resource.c
--- kern_resource.c 15 Jul 2019 20:44:48 -  1.68
+++ kern_resource.c 25 Sep 2020 18:54:34 -
@@ -157,7 +157,7 @@ sys_setpriority(struct proc *curp, void 
if (pr == NULL)
break;
error = donice(curp, pr, SCARG(uap, prio));
-   found++;
+   found = 1;
break;
 
case PRIO_PGRP: {
@@ -169,7 +169,7 @@ sys_setpriority(struct proc *curp, void 
break;
LIST_FOREACH(pr, &pg->pg_members, ps_pglist) {
error = donice(curp, pr, SCARG(uap, prio));
-   found++;
+   found = 1;
}
break;
}
@@ -180,14 +180,14 @@ sys_setpriority(struct proc *curp, void 
LIST_FOREACH(pr, &allprocess, ps_list)
if (pr->ps_ucred->cr_uid == SCARG(uap, who)) {
error = donice(curp, pr, SCARG(uap, prio));
-   found++;
+   found = 1;
}
break;
 
default:
return (EINVAL);
}
-   if (found == 0)
+   if (!found)
return (ESRCH);
return (error);
 }



Re: amap: panic -> KASSERT

2020-09-25 Thread Scott Cheloha
> On Sep 24, 2020, at 07:43, Theo de Raadt  wrote:
> 
> Mark Kettenis  wrote:
> 
>>> Date: Thu, 24 Sep 2020 11:53:59 +0200
>>> From: Martin Pieuchot 
>>> 
>>> Convert various "if (x) panic()" idioms into "KASSERT(!x)".  The panic
>>> message isn't helping for such sanity checks and this help reducing the
>>> diff with NetBSD.
>>> 
>>> ok?
>> 
>> Yes, the KASSERTs are probably more useful for debugging.  The
>> downside is that we lose the checks in RAMDISK kernels.  The upside of
>> that is that it makes the kernel smaller.
>> 
>> ok kettenis@
> 
> That's the complete assessment of the situation, and on the scale I'm
> happy with the diff.
> 
> ok deraadt

Is there any scenario where

if (condition)
panic();

is preferable to

KASSERT(condition);

outside of function calls with side effects?
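
To spell out the side-effect case excluded above: KASSERT(9) compiles
to nothing in kernels built without DIAGNOSTIC, e.g. RAMDISK kernels,
so the condition must not do any work.  A sketch (bus_reset() is made
up for illustration):

	/* Wrong: the reset vanishes when KASSERT() is compiled out. */
	KASSERT(bus_reset(sc) == 0);

	/* Right: do the work unconditionally, assert on the result. */
	error = bus_reset(sc);
	KASSERT(error == 0);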



top(1): display uptime in HH:MM:SS format

2020-09-18 Thread Scott Cheloha
Hi,

After fussing with top(1)'s uptime display a bit I now think:

- An HH:MM:SS format uptime is useful in top(1).  It's also more
  visually consistent with the local timestamp printed on the line
  above it, so it is easier to read at a glance.

- The variable printing of "days" is annoying.  I would rather it
  just told me "0 days" if it had been less than one day.  It sucks
  when the information you want moves around or isn't shown at all.
  It's clever, sure, but I'd rather it be consistent.

This patch changes the uptime format string to "up D days HH:MM:SS".
The format string does not vary with the elapsed uptime.  There is no
inclusion/omission of the plural suffix depending on whether days is
equal to one.

For example, my machine has been up less than an hour.  Here's the
display in the -current top(1):

load averages:  0.24,  0.16,  0.11jetsam.local 15:30:15
50 processes: 49 idle, 1 on processor  up  0:22

And here's the display in my patched top(1):

load averages:  0.43,  0.24,  0.14jetsam.local 15:31:06
50 processes: 49 idle, 1 on processorup 0 days 00:22:25

I prefer the patched display.  Consistent formatting and more
information.

Maybe you like it too?  Try it out for a day or two.

Thoughts?

Index: display.c
===
RCS file: /cvs/src/usr.bin/top/display.c,v
retrieving revision 1.65
diff -u -p -r1.65 display.c
--- display.c   26 Aug 2020 16:21:28 -  1.65
+++ display.c   18 Sep 2020 20:53:36 -
@@ -212,7 +212,7 @@ static void
 format_uptime(char *buf, size_t buflen)
 {
time_t uptime;
-   int days, hrs, mins;
+   int days, hrs, mins, secs;
struct timespec boottime;
 
/*
@@ -220,18 +220,14 @@ format_uptime(char *buf, size_t buflen)
 */
if (clock_gettime(CLOCK_BOOTTIME, &boottime) != -1) {
uptime = boottime.tv_sec;
-   uptime += 30;
days = uptime / (3600 * 24);
uptime %= (3600 * 24);
hrs = uptime / 3600;
uptime %= 3600;
mins = uptime / 60;
-   if (days > 0)
-   snprintf(buf, buflen, "up %d day%s, %2d:%02d",
-   days, days > 1 ? "s" : "", hrs, mins);
-   else
-   snprintf(buf, buflen, "up %2d:%02d",
-   hrs, mins);
+   secs = uptime % 60;
+   snprintf(buf, buflen, "up %d days %02d:%02d:%02d",
+   days, hrs, mins, secs);
}
 }
 



kstat(1): implement wait with setitimer(2)

2020-09-17 Thread Scott Cheloha
Hi,

Using nanosleep(2) to print the stats periodically causes the period
to drift.  If you use setitimer(2) it won't drift.
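
To make the drift concrete: with nanosleep(2) each period is the sleep
plus the read-and-print work, so a 1 second wait with ~50ms of work
gives a ~1.05s period and the output falls roughly 3 seconds behind
per minute.  An ITIMER_REAL interval timer keeps firing on the 1
second boundary no matter how long the work takes.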

ok?

Index: kstat.c
===
RCS file: /cvs/src/usr.bin/kstat/kstat.c,v
retrieving revision 1.6
diff -u -p -r1.6 kstat.c
--- kstat.c 13 Aug 2020 12:37:16 -  1.6
+++ kstat.c 17 Sep 2020 23:24:36 -
@@ -15,6 +15,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -27,6 +28,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -104,6 +106,7 @@ kstat_cmp(const struct kstat_entry *ea, 
 RBT_PROTOTYPE(kstat_tree, kstat_entry, entry, kstat_cmp);
 RBT_GENERATE(kstat_tree, kstat_entry, entry, kstat_cmp);
 
+static void handle_alrm(int);
 static struct kstat_filter *
kstat_filter_parse(char *);
 static int kstat_filter_entry(struct kstat_filters *,
@@ -130,20 +133,23 @@ main(int argc, char *argv[])
 {
struct kstat_filters kfs = TAILQ_HEAD_INITIALIZER(kfs);
struct kstat_tree kt = RBT_INITIALIZER();
+   struct itimerval itv;
+   time_t interval;
unsigned int version;
+   sigset_t empty;
int fd;
const char *errstr;
int ch;
-   struct timespec interval = { 0, 0 };
int i;
 
+   interval = 0;
+
while ((ch = getopt(argc, argv, "w:")) != -1) {
switch (ch) {
case 'w':
-   interval.tv_sec = strtonum(optarg, 1, 1,
-   &errstr);
+   interval = strtonum(optarg, 1, 1, &errstr);
if (errstr != NULL)
-   errx(1, "wait %s: %s", optarg, errstr);
+   errx(1, "wait is %s: %s", optarg, errstr);
break;
default:
usage();
@@ -168,17 +174,31 @@ main(int argc, char *argv[])
kstat_list(&kt, fd, version, &kfs);
kstat_print(&kt);
 
-   if (interval.tv_sec == 0)
+   if (interval == 0)
return (0);
 
-   for (;;) {
-   nanosleep(&interval, NULL);
+   signal(SIGALRM, handle_alrm);
+   sigemptyset(&empty);
+
+   itv.it_value.tv_sec = interval;
+   itv.it_value.tv_usec = 0;
+   itv.it_interval = itv.it_value;
+   if (setitimer(ITIMER_REAL, &itv, NULL) == -1)
+   err(1, "setitimer");
 
+   for (;;) {
+   sigsuspend(&empty);
kstat_read(&kt, fd);
kstat_print(&kt);
}
 
return (0);
+}
+
+static void
+handle_alrm(int signo)
+{
+   return;
 }
 
 static struct kstat_filter *



Re: systat(1): vmstat: compute rates with CLOCK_UPTIME

2020-09-16 Thread Scott Cheloha
On Wed, Sep 16, 2020 at 01:20:16AM -0600, Theo de Raadt wrote:
> Two days ago during my work on ongoing work for non-acpi suspend,
> kettenis and I observed the same thing.
> 
> your diff works very well for me.

Okay, so I'm not the only one.

Let's do the full patch:

- All rates in the vmstat view are now computed using elapsed
  real time, not elapsed CPU time.

- Measure elapsed real time with  CLOCK_UPTIME and not
  CLOCK_MONOTONIC because we don't care about time spent
  suspended.

- Pass the elapsed time to dinfo() to use when computing
  I/O rates instead of assuming how much time has elapsed.

- Keep drawing the big bar using CPU time.

Would appreciate more tests from people who depend upon systat(1)
regularly.  Does the vmstat view on your machine look reasonable with
this patch?

Pending more tests, ok?

Index: vmstat.c
===
RCS file: /cvs/src/usr.bin/systat/vmstat.c,v
retrieving revision 1.91
diff -u -p -r1.91 vmstat.c
--- vmstat.c28 Jun 2019 13:35:04 -  1.91
+++ vmstat.c17 Sep 2020 00:24:00 -
@@ -83,7 +83,7 @@ staticenum state { BOOT, TIME, RUN } st
 static void allocinfo(struct Info *);
 static void copyinfo(struct Info *, struct Info *);
 static float cputime(int);
-static void dinfo(int, int);
+static void dinfo(int, int, double);
 static void getinfo(struct Info *);
 void putint(int, int, int, int);
 void putintmk(int, int, int, int);
@@ -320,7 +320,7 @@ labelkre(void)
 #define PUTRATE(fld, l, c, w) \
do { \
Y(fld); \
-   putint((int)((float)s.fld/etime + 0.5), l, c, w); \
+   putint((int)((float)s.fld / eruntime + 0.5), l, c, w); \
} while (0)
 #define MAXFAIL 5
 
@@ -330,12 +330,18 @@ staticchar cpuorder[] = { CP_INTR, CP_S
 void
 showkre(void)
 {
+   static struct timespec prev;
+   struct timespec elapsed, now;
float f1, f2;
int psiz;
u_int64_t inttotal, intcnt;
int i, l, c;
static int failcnt = 0, first_run = 0;
-   double etime;
+   double ecputime, eruntime;
+
+   clock_gettime(CLOCK_UPTIME, &now);
+   timespecsub(&now, &prev, &elapsed);
+   prev = now;
 
if (state == TIME) {
if (!first_run) {
@@ -343,12 +349,13 @@ showkre(void)
return;
}
}
-   etime = 0;
+   eruntime = elapsed.tv_sec + elapsed.tv_nsec / 1000000000.0;
+   ecputime = 0;
for (i = 0; i < CPUSTATES; i++) {
X(cpustats.cs_time);
-   etime += s.cpustats.cs_time[i];
+   ecputime += s.cpustats.cs_time[i];
}
-   if (etime < 5.0) {  /* < 5 ticks - ignore this trash */
+   if (ecputime < 5.0) {   /* < 5 ticks - ignore this trash */
if (failcnt++ >= MAXFAIL) {
error("The alternate system clock has died!");
failcnt = 0;
@@ -356,12 +363,12 @@ showkre(void)
return;
}
failcnt = 0;
-   etime /= hertz;
+   ecputime /= hertz;
inttotal = 0;
for (i = 0; i < nintr; i++) {
t = intcnt = s.intrcnt[i];
s.intrcnt[i] -= s1.intrcnt[i];
-   intcnt = (u_int64_t)((float)s.intrcnt[i]/etime + 0.5);
+   intcnt = (u_int64_t)((float)s.intrcnt[i] / eruntime + 0.5);
inttotal += intcnt;
if (intrloc[i] != 0)
putuint64(intcnt, intrloc[i], INTSCOL, 8);
@@ -451,7 +458,7 @@ showkre(void)
mvprintw(DISKROW, DISKCOL + 5 + c,
" %*s", l, dr_name[i]);
c += 1 + l;
-   dinfo(i, c);
+   dinfo(i, c, eruntime);
}
/* and pad the DRIVESPACE */
l = DRIVESPACE - c;
@@ -674,11 +681,9 @@ copyinfo(struct Info *from, struct Info 
 }
 
 static void
-dinfo(int dn, int c)
+dinfo(int dn, int c, double etime)
 {
-   double words, atime, etime;
-
-   etime = naptime;
+   double words, atime;
 
c += DISKCOL;
 



systat(1): vmstat: compute rates with CLOCK_UPTIME

2020-09-15 Thread Scott Cheloha
Hi,

systat(1)'s vmstat view displays rates for things like interrupts.
Strangely, it uses CPU time to compute these rates, not real time.

This is potentially misleading, particularly on an MP system.  If I
have 4 cores running on a HZ=100 kernel I expect ~400 clock interrupts
per second.  But systat(1) tells me I have 100 because we have 4
seconds worth of CPU time for every second of real time that elapses.

I don't like it.  I want to see how many interrupts there really were.

This patch changes the vmstat view to use CLOCK_UPTIME to measure
elapsed time and uses that when computing rates.  The "Big Bar" is
still drawn using CPU time, but for everything else I think you would
want a rate based on the elapsed real time.  Using CPU time isn't
intuitive.

We want CLOCK_UPTIME, not CLOCK_MONOTONIC, because we aren't
interested in what the machine was doing when it was suspended.

I have not changed dinfo() to keep the patch simple, but we should be
using CLOCK_UPTIME there, too.

Thoughts?

Index: vmstat.c
===
RCS file: /cvs/src/usr.bin/systat/vmstat.c,v
retrieving revision 1.91
diff -u -p -r1.91 vmstat.c
--- vmstat.c28 Jun 2019 13:35:04 -  1.91
+++ vmstat.c16 Sep 2020 03:56:14 -
@@ -320,7 +320,7 @@ labelkre(void)
 #define PUTRATE(fld, l, c, w) \
do { \
Y(fld); \
-   putint((int)((float)s.fld/etime + 0.5), l, c, w); \
+   putint((int)((float)s.fld / eruntime + 0.5), l, c, w); \
} while (0)
 #define MAXFAIL 5
 
@@ -330,12 +330,18 @@ staticchar cpuorder[] = { CP_INTR, CP_S
 void
 showkre(void)
 {
+   static struct timespec prev;
+   struct timespec elapsed, now;
float f1, f2;
int psiz;
u_int64_t inttotal, intcnt;
int i, l, c;
static int failcnt = 0, first_run = 0;
-   double etime;
+   double ecputime, eruntime;
+
+   clock_gettime(CLOCK_UPTIME, &now);
+   timespecsub(&now, &prev, &elapsed);
+   prev = now;
 
if (state == TIME) {
if (!first_run) {
@@ -343,12 +349,13 @@ showkre(void)
return;
}
}
-   etime = 0;
+   eruntime = elapsed.tv_sec + elapsed.tv_nsec / 1000000000.0;
+   ecputime = 0;
for (i = 0; i < CPUSTATES; i++) {
X(cpustats.cs_time);
-   etime += s.cpustats.cs_time[i];
+   ecputime += s.cpustats.cs_time[i];
}
-   if (etime < 5.0) {  /* < 5 ticks - ignore this trash */
+   if (ecputime < 5.0) {   /* < 5 ticks - ignore this trash */
if (failcnt++ >= MAXFAIL) {
error("The alternate system clock has died!");
failcnt = 0;
@@ -356,12 +363,12 @@ showkre(void)
return;
}
failcnt = 0;
-   etime /= hertz;
+   ecputime /= hertz;
inttotal = 0;
for (i = 0; i < nintr; i++) {
t = intcnt = s.intrcnt[i];
s.intrcnt[i] -= s1.intrcnt[i];
-   intcnt = (u_int64_t)((float)s.intrcnt[i]/etime + 0.5);
+   intcnt = (u_int64_t)((float)s.intrcnt[i] / eruntime + 0.5);
inttotal += intcnt;
if (intrloc[i] != 0)
putuint64(intcnt, intrloc[i], INTSCOL, 8);



Re: timeout(9): add clock-based timeouts (attempt 2)

2020-09-07 Thread Scott Cheloha
On Sat, Sep 05, 2020 at 01:11:59PM +0200, Mark Kettenis wrote:
> > Date: Fri, 4 Sep 2020 17:55:39 -0500
> > From: Scott Cheloha 
> > 
> > On Sat, Jul 25, 2020 at 08:46:08PM -0500, Scott Cheloha wrote:
> > > 
> > > [...]
> > > 
> > > I want to add clock-based timeouts to the kernel because tick-based
> > > timeouts suffer from a few problems:
> > > 
> > > [...]
> > > 
> > > Basically, ticks are a poor approximation for the system clock.  We
> > > should use the real thing where possible.
> > > 
> > > [...]
> > > 
> > > Thoughts on this approach?  Thoughts on the proposed API?
> > 
> > 6 week bump.
> > 
> > Attached is an rebased and streamlined diff.
> > 
> > Let's try again:
> > 
> > This patch adds support for timeouts scheduled against the hardware
> > timecounter.  I call these "kclock timeouts".  They are distinct from
> > the current tick-based timeouts because ticks are "software time", not
> > "real time".
> 
> So what's the end game here?  Are these kclock-based timeouts going to
> replace the tick-based timeouts at some point in the future?  I can
> see why you want to have both in parallel for a while, but long-term I
> don't think we want to keep both.

Ideally we would replace tick-based timeouts entirely with kclock
timeouts eventually.

There are a few roadblocks, though:

1. The scheduler is tick-based.  If you want to wait until the next
   tick, the easiest way to do that is with timeout_add(9) or tsleep(9).

2. Linux has ktimers, which is tick-based.  drm uses it.  Shouldn't
   we have a tick-based timeout interface for compatibility with them?
   We could fake it, like FreeBSD does, but doing so is probably more
   complicated than just keeping support for tick-based timeouts.

3. Scheduling a timeout with timeout_add(9) is fast.  Scheduling a
   timeout with timeout_in_nsec(9) involves a clock read.  It is slower.
   It is probably too slow for some code.

(1) will be overcome if ever the scheduler is no longer tick-based.

(2) is tricky.  Maybe you or jsg@ have an opinion?

(3) is somewhat easier to fix.  I intend to introduce a TIMEOUT_COARSE
flag in the future which causes timeout_in_nsec() to call
getnanouptime(9) instead of nanouptime(9).  Reading the timestamp is
faster than reading the clock.  You lose accuracy, but any code
worried about the overhead of reading the clock is probably not very
concerned with accuracy.

> We don't really want to do a wholesale conversion of APIs again I'd
> say.  So at some point the existing timeout_add_xxx() calls should be
> implemented in terms of "kclock timeouts".

We can do this, but we'll still need to change the calls that
reschedule a periodic timeout to use the dedicated rescheduling
interface.  Otherwise those periodic timeouts will drift.  They don't
currently drift because a tick is a very coarse unit of time.  With
nanosecond resolution we'll get drift.
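
A sketch of the difference (timeout_reschedule_nsec() is a placeholder
for whatever the dedicated interface ends up being called):

	/* Drifts: the next period starts "now", so any latency between
	 * expiry and rescheduling accumulates every period. */
	timeout_in_nsec(&to, SEC_TO_NSEC(1));

	/* Doesn't drift: the next deadline is computed from the previous
	 * absolute deadline, not from the current time. */
	timeout_reschedule_nsec(&to, SEC_TO_NSEC(1));	/* hypothetical */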

> This implementation is still tick driven, so it doesn't really provide
> sub-tick resolution.

Yes, that's right.  Each timeout maintains nanosecond resolution for
its expiration time but will only actually run after hardclock(9) runs
and dumps the timeout to softclock().

We would need to implement a more flexible clock interrupt scheduler
to run timeouts in between hardclocks.

> What does that mean for testing this?  I mean if we spend a lot of time
> now to verify that subsystems can tolerate the more fine-grained timeouts,
> we need to that again when you switch from having a period interrupt driving
> the wheel to having a scheduled interrupt isn't it?

Yes.  But both changes can break things.

I think we should do kclock timeouts before sub-tick timeouts.  The
former is a lot less disruptive than the latter, as the timeouts still
run right after the hardclock.

And you need kclock timeouts to even test sub-tick timeouts anyway.

> > For now we have one kclock, KCLOCK_UPTIME, which corresponds to
> > nanouptime(9).  In the future I intend to add support for runtime and
> > UTC kclocks.
> 
> Do we really need that?  I suppose it helps implementing something
> like clock_nanosleep() with the TIMER_ABSTIME flag for various
> clock_id values?

Exactly.

FreeBSD decided to not support multiple clocks in their revamped
callout(9).  The result is a bit simpler (one clock) but in order to
implement absolute CLOCK_REALTIME sleeps for userspace they have this
flag for each thread that causes the thread to wake up and reschedule
itself whenever settimeofday(2) happens.

It's clever, but it seems messy to me.

I would rather support UTC timeouts as "first class citizens" of the
timeout subsystem.  Linux's hrtimers API supports UTC timeouts
explicitl

Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-09-05 Thread Scott Cheloha
On Tue, Aug 25, 2020 at 12:22:19PM -0700, Mike Larkin wrote:
> On Tue, Aug 25, 2020 at 12:12:36PM -0700, Mike Larkin wrote:
> > On Mon, Aug 24, 2020 at 01:55:45AM +0200, Mark Kettenis wrote:
> > > > Date: Sun, 23 Aug 2020 18:11:12 -0500
> > > > From: Scott Cheloha 
> > > >
> > > > Hi,
> > > >
> > > > Other BSDs use the TSC to implement delay(9) if the TSC is constant
> > > > and invariant.  Here's a patch to add something similar to our kernel.
> > >
> > > If the TSC is fine as a timecounter it should be absolutely fine for
> > > use as delay().  And we could even use if the TSC isn't synchronized
> > > between CPUs.
> > >
> > > > This patch (or something equivalent) is a prerequisite to running the
> > > > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > > > implement delay(9) when it isn't running in periodic mode is too
> > > > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > > > need an alternative.
> > >
> > > Hmm, but what are we going to use on machines where the TSC isn't
> > > constant/invariant?
> > >
> > > In what respect is the i8254 too slow?  Does it take more than a
> > > microsecond to read it?
> >
> > It's 3 outb/inb pairs to ensure you get the reading correct. So that could
> > be quite a long time (as cheloha@ points out). Also, that's 6 VM exits if
> > running virtually (I realize that's not the main use case here but just
> > saying...)
> >
> > IIRC the 3 in/out pairs are the latch command followed by reading the
> > LSB/MSB of the counter.  It's not MMIO like the HPET or ACPI timer.
> >
> > And as cheloha@ also points out, it is highly likely that none of us
> > have a real i8254 anymore; much of this is probably implemented in
> > some EC somewhere, and it's unlikely the developer of said EC put a
> > lot of effort into optimizing the implementation of a legacy device
> > like this.
> >
> > On the topic of virtualization:
> >
> > while (rdtsc() - start < want)
> >  rdtsc();
> 
> I just realized the original diff didn't do two rdtscs.  It did a pause
> inside the loop.  So the effect is not *as* bad as I described, but it's
> still *somewhat* bad.
> 
> PS - pause loop exiting can be enabled to improve performance in this
> situation.

What I'm getting from Mike's and kettenis@'s replies is that this is a
generally good idea.

We should add an HPET fallback for the nasty cases where your TSC has
drift, because the i8254 is slooow.  But "hpet_delay()" can wait for a
later patch: we haven't switched the lapic into oneshot mode yet, so
lapic_delay() is still usable and fast.

So, is this ok?  Or do I need to tweak something?  I think I'm setting
delay_func to tsc_delay under the right circumstances: we know the TSC
is invariant.

kettenis@: as I mentioned, we need to do the delay_func pointer
comparison in lapic_calibrate_timer() to keep from clobbering
tsc_delay.  We can't compare it with NULL because it is set to
i8254_delay() by default in amd64/machdep.c.

Index: amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.56
diff -u -p -r1.56 lapic.c
--- amd64/lapic.c	3 Sep 2020 21:38:46 -0000	1.56
+++ amd64/lapic.c	5 Sep 2020 14:44:08 -0000
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include <machine/cpuvar.h>
 #include 
 #include 
 #include 
@@ -584,7 +585,8 @@ skip_calibration:
 * Now that the timer's calibrated, use the apic timer routines
 * for all our timing needs..
 */
-   delay_func = lapic_delay;
+   if (delay_func != tsc_delay)
+   delay_func = lapic_delay;
initclock_func = lapic_initclocks;
}
 }
Index: amd64/tsc.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.20
diff -u -p -r1.20 tsc.c
--- amd64/tsc.c 23 Aug 2020 21:38:47 -0000	1.20
+++ amd64/tsc.c 5 Sep 2020 14:44:08 -0000
@@ -26,6 +26,7 @@
 
 #include 
 #include 
+#include <machine/cpuvar.h>
 
 #define RECALIBRATE_MAX_RETRIES		5
 #define RECALIBRATE_SMI_THRESHOLD		5
@@ -252,7 +253,8 @@ tsc_timecounter_init(struct cpu_info *ci
tsc_timecounter.tc_quality = -1000;
tsc_timecounter.tc_user = 0;
tsc_is_invariant = 0;
-   }
+   } else
+   delay_func = tsc_delay;
 
tc_init(&tsc_timecounter);
 }
@@ -342,4 +344,15 @@ tsc_sync_ap(struct cpu_info *ci)
 {
tsc_p

Re: timeout(9): add clock-based timeouts (attempt 2)

2020-09-04 Thread Scott Cheloha
On Fri, Sep 04, 2020 at 05:55:40PM -0500, Scott Cheloha wrote:
> On Sat, Jul 25, 2020 at 08:46:08PM -0500, Scott Cheloha wrote:
> > 
> > [...]
> > 
> > I want to add clock-based timeouts to the kernel because tick-based
> > timeouts suffer from a few problems:
> > 
> > [...]
> > 
> > Basically, ticks are a poor approximation for the system clock.  We
> > should use the real thing where possible.
> > 
> > [...]
> > 
> > Thoughts on this approach?  Thoughts on the proposed API?
> 
> 6 week bump.
> 
> Attached is a rebased and streamlined diff.
> 
> [...]

Updated diff fixes a pasto.

Index: kern/kern_timeout.c
===
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.79
diff -u -p -r1.79 kern_timeout.c
--- kern/kern_timeout.c 7 Aug 2020 00:45:25 -0000	1.79
+++ kern/kern_timeout.c 5 Sep 2020 01:54:42 -0000
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_timeout.c,v 1.79 2020/08/07 00:45:25 cheloha Exp $ */
+/* $OpenBSD: kern_timeout.c,v 1.77 2020/08/01 08:40:20 anton Exp $ */
 /*
  * Copyright (c) 2001 Thomas Nordin 
  * Copyright (c) 2000-2001 Artur Grabowski 
@@ -64,16 +64,27 @@ struct timeoutstat tostat;  /* [T] stati
  * of the global variable "ticks" when the timeout should be called. There are
  * four levels with 256 buckets each.
  */
-#define BUCKETS 1024
+#define WHEELCOUNT 4
 #define WHEELSIZE 256
 #define WHEELMASK 255
 #define WHEELBITS 8
+#define BUCKETS (WHEELCOUNT * WHEELSIZE)
 
-struct circq timeout_wheel[BUCKETS];   /* [T] Queues of timeouts */
+struct circq timeout_wheel[BUCKETS];   /* [T] Tick-based timeouts */
+struct circq timeout_wheel_kc[BUCKETS];/* [T] Clock-based timeouts */
 struct circq timeout_new;  /* [T] New, unscheduled timeouts */
 struct circq timeout_todo; /* [T] Due or needs rescheduling */
 struct circq timeout_proc; /* [T] Due + needs process context */
 
+time_t timeout_level_width[WHEELCOUNT];	/* [I] Wheel level width (seconds) */
+struct timespec tick_ts;   /* [I] Length of a tick (1/hz secs) */
+
+struct kclock {
+	struct timespec kc_lastscan;	/* [T] Clock time at last wheel scan */
+	struct timespec kc_late;	/* [T] Late if due prior */
+	struct timespec kc_offset;	/* [T] Offset from primary kclock */
+} timeout_kclock[KCLOCK_MAX];
+
 #define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
 
 #define BUCKET(rel, abs)   \
@@ -155,9 +166,15 @@ struct lock_type timeout_spinlock_type =
((needsproc) ? &timeout_sleeplock_obj : &timeout_spinlock_obj)
 #endif
 
+void kclock_nanotime(int, struct timespec *);
 void softclock(void *);
 void softclock_create_thread(void *);
+void softclock_process_kclock_timeout(struct timeout *, int);
+void softclock_process_tick_timeout(struct timeout *, int);
 void softclock_thread(void *);
+uint32_t timeout_bucket(struct timeout *);
+uint32_t timeout_maskwheel(uint32_t, const struct timespec *);
+void timeout_run(struct timeout *);
 void timeout_proc_barrier(void *);
 
 /*
@@ -207,13 +224,19 @@ timeout_sync_leave(int needsproc)
 void
 timeout_startup(void)
 {
-   int b;
+   int b, level;
 
	CIRCQ_INIT(&timeout_new);
	CIRCQ_INIT(&timeout_todo);
	CIRCQ_INIT(&timeout_proc);
	for (b = 0; b < nitems(timeout_wheel); b++)
		CIRCQ_INIT(&timeout_wheel[b]);
+	for (b = 0; b < nitems(timeout_wheel_kc); b++)
+		CIRCQ_INIT(&timeout_wheel_kc[b]);
+
+	for (level = 0; level < nitems(timeout_level_width); level++)
+		timeout_level_width[level] = 2 << (level * WHEELBITS);
+	NSEC_TO_TIMESPEC(tick_nsec, &tick_ts);
 }
 
 void
@@ -229,25 +252,39 @@ timeout_proc_init(void)
kthread_create_deferred(softclock_create_thread, NULL);
 }
 
+static inline void
+_timeout_set(struct timeout *to, void (*fn)(void *), void *arg, int flags,
+int kclock)
+{
+   to->to_func = fn;
+   to->to_arg = arg;
+   to->to_flags = flags | TIMEOUT_INITIALIZED;
+   to->to_kclock = kclock;
+}
+
 void
 timeout_set(struct timeout *new, void (*fn)(void *), void *arg)
 {
-   timeout_set_flags(new, fn, arg, 0);
+   _timeout_set(new, fn, arg, 0, KCLOCK_NONE);
 }
 
 void
 timeout_set_flags(struct timeout *to, void (*fn)(void *), void *arg, int flags)
 {
-   to->to_func = fn;
-   to->to_arg = arg;
-   to->to_process = NULL;
-   to->to_flags = flags | TIMEOUT_INITIALIZED;
+   _timeout_set(to, fn, arg, flags, KCLOCK_NONE);
 }
 
 void
 timeout_set_proc(struct timeout *new, void (*fn)(void *), void *arg)
 {
-   timeout_set_flags(new, fn, arg, TIMEOUT_PROC);
+   _timeout_set(new, fn, arg, TIMEOUT_PROC, KCLOCK_NONE);
+}
+
+void
+timeout_set_kclock(struct timeout *to, void (*fn)(void *), vo

Re: timeout(9): add clock-based timeouts (attempt 2)

2020-09-04 Thread Scott Cheloha
On Sat, Jul 25, 2020 at 08:46:08PM -0500, Scott Cheloha wrote:
> 
> [...]
> 
> I want to add clock-based timeouts to the kernel because tick-based
> timeouts suffer from a few problems:
> 
> [...]
> 
> Basically, ticks are a poor approximation for the system clock.  We
> should use the real thing where possible.
> 
> [...]
> 
> Thoughts on this approach?  Thoughts on the proposed API?

6 week bump.

Attached is a rebased and streamlined diff.

Let's try again:

This patch adds support for timeouts scheduled against the hardware
timecounter.  I call these "kclock timeouts".  They are distinct from
the current tick-based timeouts because ticks are "software time", not
"real time".

For now we have one kclock, KCLOCK_UPTIME, which corresponds to
nanouptime(9).  In the future I intend to add support for runtime and
UTC kclocks.

Why do we want kclock timeouts at all?

1. Kclock timeouts expire at an actual time, not a tick.  They
   have nanosecond resolution and are NTP-sensitive.  Thus, they
   will *never* fire early.

2. One upshot of nanosecond resolution is that we don't need to
   "round up" to the next tick when scheduling a timeout to prevent
   early execution.  The extra resolution allows us to reduce
   latency in some contexts.

3. Kclock timeouts cover the entire range of the kernel timeline.
   We can remove the "tick loops" like the one in sys_nanosleep().

4. Kclock timeouts are scheduled as absolute deadlines.  This makes
   supporting absolute timeouts trivial, which means we can add support
   for clock_nanosleep(2) and the absolute pthread timeouts to the
   kernel.

Kclock timeouts aren't actually used anywhere yet, so merging this
patch will not break anything like last time (CC bluhm@).

In a subsequent diff I will put them to use in tsleep_nsec(9) etc.
This will enable those interfaces to block for less than a tick, which
in turn will allow userspace to block for less than a tick in e.g.
futex(2), and poll(2).  pd@ has verified that this fixes the "time
problem" in OpenBSD vmm(4) VMs (CC pd@).

You initialize kclock timeouts with timeout_set_kclock().  You
schedule them with timeout_in_nsec(), a relative timeout interface
that accepts a count of nanoseconds.  If your timeout is in some
other unit (seconds, milliseconds, whatever) you must convert it
to nanoseconds before scheduling.  Something like this will work:

	timeout_in_nsec(&foo_timeout, SEC_TO_NSEC(1));

There won't be a flavored API supporting every conceivable time unit.
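
To make that concrete, here is a minimal sketch of the intended usage.
The foo_* names are hypothetical, and I'm assuming timeout_set_kclock()
takes (func, arg, flags, kclock) in the same order as the internal
_timeout_set() helper:

	struct timeout foo_timeout;

	void
	foo_expired(void *arg)
	{
		/* Runs from softclock(), no sooner than 1s after arming. */
	}

	void
	foo_arm(void *arg)
	{
		timeout_set_kclock(&foo_timeout, foo_expired, arg, 0,
		    KCLOCK_UPTIME);
		timeout_in_nsec(&foo_timeout, SEC_TO_NSEC(1));
	}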

In the future I will expose an absolute timeout interface and a
periodic timeout rescheduling interface.  We don't need either of
these interfaces to start, though.

Tick-based timeouts and kclock-based timeouts are *not* compatible.
You cannot schedule a kclock timeout with timeout_add(9).  You cannot
schedule a tick-based timeout with timeout_in_nsec(9).  I have added
KASSERTs to prevent this.
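
Roughly, assuming the to_kclock field used in the patch:

	/* In timeout_add(9) and the other tick-based interfaces: */
	KASSERT(to->to_kclock == KCLOCK_NONE);

	/* In timeout_in_nsec(): */
	KASSERT(to->to_kclock != KCLOCK_NONE);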

Scheduling a kclock timeout with timeout_in_nsec() is more costly than
scheduling a tick-based timeout with timeout_add(9) because you have
to read the hardware timecounter.  The cost will vary with your clock:
bad clocks have lots of overhead, good clocks have low-to-no overhead.
The programmer will need to decide if the potential overhead is too
high when employing these timeouts.  In most cases the overhead will
not be a problem.  The network stack is one spot where it might be.

Processing the kclock timeout wheel during hardclock(9) adds
negligible overhead to that routine.

Processing a kclock timeout during softclock() is roughly 4 times as
expensive as processing a tick-based timeout.  At idle on my 2GHz
amd64 machine tick-based timeouts take ~125 cycles to process while
kclock timeouts take ~500 cycles.  The average cost seems to drop as
more kclock timeouts are processed, though I can't really explain why.

Thoughts?  ok?

Index: kern/kern_timeout.c
===
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.79
diff -u -p -r1.79 kern_timeout.c
--- kern/kern_timeout.c 7 Aug 2020 00:45:25 -0000	1.79
+++ kern/kern_timeout.c 4 Sep 2020 22:41:12 -0000
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_timeout.c,v 1.79 2020/08/07 00:45:25 cheloha Exp $ */
+/* $OpenBSD: kern_timeout.c,v 1.77 2020/08/01 08:40:20 anton Exp $ */
 /*
  * Copyright (c) 2001 Thomas Nordin 
  * Copyright (c) 2000-2001 Artur Grabowski 
@@ -64,16 +64,27 @@ struct timeoutstat tostat;  /* [T] stati
  * of the global variable "ticks" when the timeout should be called. There are
  * four levels with 256 buckets each.
  */
-#define BUCKETS 1024
+#define WHEELCOUNT 4
 #define WHEELSIZE 256
 #define WHEELMASK 255
 #define WHEELBITS 8
+#define BUCKETS (WHEELCOUNT * WHEELSIZE)
 
-struct circq timeout_wheel[BUCKETS];   /* [T] Queues of timeouts */
+struct circq timeout_wheel[BUCKETS];   /* [T] Tic

Re: amd64: calibrate lapic timer frequency in constant time

2020-09-01 Thread Scott Cheloha
On Tue, Sep 01, 2020 at 06:14:05PM +0200, Mark Kettenis wrote:
> > Date: Tue, 1 Sep 2020 11:05:26 -0500
> > From: Scott Cheloha 
> > 
> > Hi,
> > 
> > At boot, if we don't know the lapic frequency offhand we compute it by
> > waiting for a known clock (the i8254) with a known frequency to cycle
> > a few times.
> > 
> > Currently we cycle hz times.  This doesn't make sense.  There is
> > little to no benefit to waiting additional cycles if your kernel is
> > compiled with a larger HZ.  Mostly it just makes the calibration take
> > longer.
> > 
> > Consider the common HZ=1000 case.  What is the benefit of looping an
> > additional 900 times?  The point of diminishing returns is well under
> > 1000 loops.
> > 
> > 20-50 loops is probably sufficient to limit our error, but I don't
> > want to break anything so let's use 100, like we do on default
> > kernels.
> > 
> > ok?
> 
> Sorry, but this makes no sense to me.  The current code waits for 1
> second regardless of what HZ is.
> 
> And I expect that the accuracy of the measurement depends on the total
> elapsed time, so I expect less accurate results if you only wait 100
> cycles at HZ=1000 (which is 0.1 second).

Whoops.

Yes, you're right, I'm having a slow moment.  Nevermind that last
patch.

But this gives me an idea.

Given that we're waiting the same amount of time (1 second) no matter
the value of hz, and we're using the same clock to do the wait... why
are we fussing with the inner workings of reading the i8254?  We have
a function for this: i8254_delay().

If we just use that and tell it to wait a million microseconds we can
throw out a bunch of code.
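
Because the wait is exactly one second of i8254 time, the lapic tick
delta over the interval is the lapic frequency itself, i.e.
lapic_per_second = startapic - endapic, and all of the dtick/dapic
scaling through TIMER_FREQ falls away.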

Index: lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.55
diff -u -p -r1.55 lapic.c
--- lapic.c 3 Aug 2019 14:57:51 -0000	1.55
+++ lapic.c 1 Sep 2020 16:40:39 -0000
@@ -451,24 +451,6 @@ lapic_initclocks(void)
i8254_inittimecounter_simple();
 }
 
-
-extern int gettick(void);  /* XXX put in header file */
-extern u_long rtclock_tval; /* XXX put in header file */
-
-static __inline void
-wait_next_cycle(void)
-{
-   unsigned int tick, tlast;
-
-   tlast = (1 << 16);  /* i8254 counter has 16 bits at most */
-   for (;;) {
-   tick = gettick();
-   if (tick > tlast)
-   return;
-   tlast = tick;
-   }
-}
-
 /*
  * Calibrate the local apic count-down timer (which is running at
  * bus-clock speed) vs. the i8254 counter/timer (which is running at
@@ -484,7 +466,7 @@ void
 lapic_calibrate_timer(struct cpu_info *ci)
 {
unsigned int startapic, endapic;
-   u_int64_t dtick, dapic, tmp;
+   u_int64_t tmp;
u_long s;
int i;
 
@@ -504,29 +486,13 @@ lapic_calibrate_timer(struct cpu_info *c
 
s = intr_disable();
 
-   /* wait for current cycle to finish */
-   wait_next_cycle();
-
startapic = lapic_gettick();
-
-   /* wait the next hz cycles */
-   for (i = 0; i < hz; i++)
-   wait_next_cycle();
-
+   i8254_delay(1000000);
endapic = lapic_gettick();
 
intr_restore(s);
 
-   dtick = hz * rtclock_tval;
-   dapic = startapic-endapic;
-
-   /*
-* there are TIMER_FREQ ticks per second.
-* in dtick ticks, there are dapic bus clocks.
-*/
-   tmp = (TIMER_FREQ * dapic) / dtick;
-
-   lapic_per_second = tmp;
+   lapic_per_second = startapic - endapic;
 
 skip_calibration:
printf("%s: apic clock running at %dMHz\n",



amd64: calibrate lapic timer frequency in constant time

2020-09-01 Thread Scott Cheloha
Hi,

At boot, if we don't know the lapic frequency offhand we compute it by
waiting for a known clock (the i8254) with a known frequency to cycle
a few times.

Currently we cycle hz times.  This doesn't make sense.  There is
little to no benefit to waiting additional cycles if your kernel is
compiled with a larger HZ.  Mostly it just makes the calibration take
longer.

Consider the common HZ=1000 case.  What is the benefit of looping an
additional 900 times?  The point of diminishing returns is well under
1000 loops.

20-50 loops is probably sufficient to limit our error, but I don't
want to break anything so let's use 100, like we do on default
kernels.

ok?

Index: lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.55
diff -u -p -r1.55 lapic.c
--- lapic.c 3 Aug 2019 14:57:51 -0000	1.55
+++ lapic.c 1 Sep 2020 15:58:41 -0000
@@ -509,15 +509,15 @@ lapic_calibrate_timer(struct cpu_info *c
 
startapic = lapic_gettick();
 
-   /* wait the next hz cycles */
-   for (i = 0; i < hz; i++)
+   /* wait a few cycles */
+   for (i = 0; i < 100; i++)
wait_next_cycle();
 
endapic = lapic_gettick();
 
intr_restore(s);
 
-   dtick = hz * rtclock_tval;
+   dtick = 100 * rtclock_tval;
dapic = startapic-endapic;
 
/*



Re: getitimer(2), setitimer(2): merge critical sections

2020-09-01 Thread Scott Cheloha
On Mon, Aug 17, 2020 at 05:55:34PM -0500, Scott Cheloha wrote:
> 
> [...]

Two week bump.

In summary:

- Merge the critical sections so that "timer swap" with setitimer(2)
  is atomic.

- To do this, move error-free operations into a common kernel
  subroutine, setitimer().  Now we have one critical section.

- Leave error-prone operations in sys_getitimer() and sys_setitimer().

In order to make the "timer swap" atomic we leave the timer installed
if the copyout(9) fails.  This isn't great, but it is unavoidable
without permitting copyout(9) from within a mutex.  FreeBSD and Linux
went this direction, too.  I would rather leave the timer running and
have an atomic swap than race the hardclock(9) or the realitexpire()
timeout.

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -0000	1.140
+++ kern_time.c 1 Sep 2020 15:19:07 -0000
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+	itimer = &pr->ps_timer[which];
+
+	if (itv != NULL) {
+		TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+		TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_enter(&itimer_mtx);
+
+	if (olditv != NULL)
+		oldits = *itimer;
+	if (itv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec cts;
+			getnanouptime(&cts);
+			if (timespecisset(&its.it_value)) {
+				timo = tstohz(&its.it_value);
+				timeout_add(&pr->ps_realit_to, timo);
+				timespecadd(&its.it_value, &cts, &its.it_value);
+			} else
+				timeout_del(&pr->ps_realit_to);
+		}
+		*itimer = its;
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_leave(&itimer_mtx);
+
+	if (olditv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec now;
+			getnanouptime(&now);
+			if (timespecisset(&oldits.it_value)) {
+				if (timespeccmp(&oldits.it_value, &now, <))
+					timespecclear(&oldits.it_value);
+				else
+					timespecsub(&oldits.it_value, &now,
+					    &oldits.it_value);
+			}
+		}
+		TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+		TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+	}
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-	itimer = &p->p_p->ps_timer[which];
	memset(&aitv, 0, sizeof(aitv));

-	if (which != ITIMER_REAL)
-		mtx_enter(&itimer_mtx);
-	its = *itimer;
-	if (which != ITIMER_REAL)
-		mtx_leave(&itimer_mtx);
-
-	if (which == ITIMER_REAL) {
-		struct timespec now;
-
-		getnanouptime(&now);
-		/*
-		 * Convert from absolute to relative time in .it_value
-		 * part of real time timer.  If time for real time timer
-		 * has passed return 0, else return difference between
-		 * current time and time for the timer to go off.
-		 */
-		if (timespecisset(&its.it_value)) {
-			if (timespeccmp(&its.it_value, &now, <))
-				timespecclear(&its.it_value);
-			else
-				timespecsub(&its.it_value, &now,
-				    &its.it_value);
-		}

Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-23 Thread Scott Cheloha
On Sun, Aug 23, 2020 at 11:45:22PM -0500, Scott Cheloha wrote:
> 
> [...]
> 
> > > This patch (or something equivalent) is a prerequisite to running the
> > > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > > implement delay(9) when it isn't running in periodic mode is too
> > > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > > need an alternative.
> > 
> > Hmm, but what are we going to use on machines where the TSC isn't
> > constant/invariant?
> 
> Probably fall back on the i8254?  Unless someone wants to add yet
> another delay(9) implementation to amd64...
> 
> > In what respect is the i8254 too slow?  Does it take more than a
> > microsecond to read it?
> 
> On my machine, the portion of gettick() *within* the mutex runs in ~19
> microseconds.
> 
> That's before any overhead from mtx_enter(9).  I think having multiple
> threads in delay(9) should be relatively rare, but you have to keep
> that in mind.
> 
> No idea what the overhead would look like on real hardware.  I'm
> pretty sure my i8254 is emulated.
> 
> > We could use the HPET I suppose, which may be a bit better.
> 
> It's better.  No mutex.  On my machine it takes ~11 microseconds.
> It's a start.

Hmmm, now I'm worried I have screwed something up or misconfigured
something.

It doesn't seem right that it would take 20K cycles to read the HPET
on this machine.

Am I way off?  Or is 20K actually a reasonable number?

For comparison, lapic_gettick() completes in... 80 nanoseconds (?) on
the same machine.  Relevant sysctls:

$ sysctl hw.{model,setperf,perfpolicy} machdep.{tscfreq,invarianttsc}
hw.model=Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz
hw.setperf=100
hw.perfpolicy=high
machdep.tscfreq=2112000000
machdep.invarianttsc=1

... if it really takes that long, then "high precision" is a bit of a
misnomer.
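
(Back-of-the-envelope: 20K cycles at the ~2.1GHz TSC rate above is
about 20000 / 2.1e9 ~= 9.5 microseconds, so the cycle count is at
least consistent with the ~11 microsecond HPET read I measured
earlier.)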



Re: amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-23 Thread Scott Cheloha
On Mon, Aug 24, 2020 at 01:55:45AM +0200, Mark Kettenis wrote:
> > Date: Sun, 23 Aug 2020 18:11:12 -0500
> > From: Scott Cheloha 
> > 
> > Hi,
> > 
> > Other BSDs use the TSC to implement delay(9) if the TSC is constant
> > and invariant.  Here's a patch to add something similar to our kernel.
> 
> If the TSC is fine as a timecounter it should be absolutely fine for
> use as delay().  And we could even use if the TSC isn't synchronized
> between CPUs.

Yep, it's nice.

> > This patch (or something equivalent) is a prerequisite to running the
> > lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
> > implement delay(9) when it isn't running in periodic mode is too
> > complicated.  However, using the i8254 for delay(9) is too slow.  We
> > need an alternative.
> 
> Hmm, but what are we going to use on machines where the TSC isn't
> constant/invariant?

Probably fall back on the i8254?  Unless someone wants to add yet
another delay(9) implementation to amd64...

> In what respect is the i8254 too slow?  Does it take more than a
> microsecond to read it?

On my machine, the portion of gettick() *within* the mutex runs in ~19
microseconds.

That's before any overhead from mtx_enter(9).  I think having multiple
threads in delay(9) should be relatively rare, but you have to keep
that in mind.

No idea what the overhead would look like on real hardware.  I'm
pretty sure my i8254 is emulated.

> We could use the HPET I suppose, which may be a bit better.

It's better.  No mutex.  On my machine it takes ~11 microseconds.
It's a start.

> > As for the patch, it works for me here, though I'd appreciate a few
> > tests.  I admit that comparing function pointers is ugly, but I think
> > this is as simple as it can be without implementing some sort of
> > framework for "registering" delay(9) implementations and comparing
> > them and selecting the "best" implementation.
> 
> What about:
> 
>   if (delay_func == NULL)
>   delay_func = lapic_delay;

Nah, can't do that.  delay_func is initialized to i8254_delay().  Look
in amd64/machdep.c.

I'm curious what NetBSD and Dragonfly have done about this.  Lemme
look around.

The whole "all the clocks on amd64 are slow or broken" issue isn't
unique to us.



amd64: add tsc_delay(), a TSC-based delay(9) implementation

2020-08-23 Thread Scott Cheloha
Hi,

Other BSDs use the TSC to implement delay(9) if the TSC is constant
and invariant.  Here's a patch to add something similar to our kernel.

This patch (or something equivalent) is a prerequisite to running the
lapic timer in oneshot or TSC deadline mode.  Using the lapic timer to
implement delay(9) when it isn't running in periodic mode is too
complicated.  However, using the i8254 for delay(9) is too slow.  We
need an alternative.

As for the patch, it works for me here, though I'd appreciate a few
tests.  I admit that comparing function pointers is ugly, but I think
this is as simple as it can be without implementing some sort of
framework for "registering" delay(9) implementations and comparing
them and selecting the "best" implementation.
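
If we ever do want such a framework, I imagine something small like
this would do.  This is entirely hypothetical -- nothing like it
exists in the tree, and the names and quality values are invented:

	struct delay_impl {
		void (*di_func)(int);	/* candidate delay(9) routine */
		int di_quality;		/* higher wins */
	};

	void
	delay_register(const struct delay_impl *di)
	{
		static int delay_quality;	/* i8254_delay is the floor */

		if (di->di_quality > delay_quality) {
			delay_quality = di->di_quality;
			delay_func = di->di_func;
		}
	}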

I'm not sure I put the prototypes in the right headers.  We don't have
a tsc.h but cpuvar.h looks sorta-correct for tsc_delay().

FreeBSD's x86/delay.c may be of note:

https://github.com/freebsd/freebsd/blob/ed96335a07b688c39e16db8856232e5840bc22ac/sys/x86/x86/delay.c

Thoughts?

Index: amd64/tsc.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.20
diff -u -p -r1.20 tsc.c
--- amd64/tsc.c 23 Aug 2020 21:38:47 -0000	1.20
+++ amd64/tsc.c 23 Aug 2020 22:59:25 -0000
@@ -26,6 +26,7 @@
 
 #include 
 #include 
+#include <machine/cpuvar.h>
 
 #define RECALIBRATE_MAX_RETRIES		5
 #define RECALIBRATE_SMI_THRESHOLD		5
@@ -252,7 +253,8 @@ tsc_timecounter_init(struct cpu_info *ci
tsc_timecounter.tc_quality = -1000;
tsc_timecounter.tc_user = 0;
tsc_is_invariant = 0;
-   }
+   } else
+   delay_func = tsc_delay;
 
tc_init(&tsc_timecounter);
 }
@@ -342,4 +344,15 @@ tsc_sync_ap(struct cpu_info *ci)
 {
tsc_post_ap(ci);
tsc_post_ap(ci);
+}
+
+void
+tsc_delay(int usecs)
+{
+   uint64_t interval, start;
+
+   interval = (uint64_t)usecs * tsc_frequency / 1000000;
+   start = rdtsc_lfence();
+   while (rdtsc_lfence() - start < interval)
+   CPU_BUSY_CYCLE();
 }
Index: amd64/lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.55
diff -u -p -r1.55 lapic.c
--- amd64/lapic.c	3 Aug 2019 14:57:51 -0000	1.55
+++ amd64/lapic.c	23 Aug 2020 22:59:25 -0000
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include <machine/cpuvar.h>
 #include 
 #include 
 #include 
@@ -569,7 +570,8 @@ skip_calibration:
 * Now that the timer's calibrated, use the apic timer routines
 * for all our timing needs..
 */
-   delay_func = lapic_delay;
+   if (delay_func != tsc_delay)
+   delay_func = lapic_delay;
initclock_func = lapic_initclocks;
}
 }
Index: include/cpuvar.h
===
RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v
retrieving revision 1.10
diff -u -p -r1.10 cpuvar.h
--- include/cpuvar.h	9 Aug 2019 15:20:05 -0000	1.10
+++ include/cpuvar.h	23 Aug 2020 22:59:25 -0000
@@ -102,4 +102,6 @@ void tsc_sync_drift(int64_t);
 void tsc_sync_bp(struct cpu_info *);
 void tsc_sync_ap(struct cpu_info *);
 
+void tsc_delay(int);
+
 #endif
Index: include/i82489var.h
===
RCS file: /cvs/src/sys/arch/amd64/include/i82489var.h,v
retrieving revision 1.18
diff -u -p -r1.18 i82489var.h
--- include/i82489var.h	4 Oct 2018 05:00:40 -0000	1.18
+++ include/i82489var.h	23 Aug 2020 22:59:26 -0000
@@ -128,4 +128,6 @@ extern void lapic_calibrate_timer(struct
 extern void lapic_startclock(void);
 extern void lapic_initclocks(void);
 
+extern void lapic_delay(int);
+
 #endif



Re: timekeep: fixing large skews on amd64 with RDTSCP

2020-08-22 Thread Scott Cheloha
On Tue, Jul 28, 2020 at 10:02:07AM +0300, Paul Irofti wrote:
> 
> [...]
> 
> Is the issue with LFENCE slowing down the network stack settled? That was
> the argument against it last time.

... a month passes.  Nobody says anything.

This "it might slow down the network stack" thing keeps coming up, and
yet nobody can point to (a) who expressed this concern or (b) what the
penalty is in practice.

Note that the alternative is "your timecounter might not be monotonic
between threads".  For me, that's already a dealbreaker.

But for sake of discussion let's look at some data.  For those of you
watching from home, please follow along!  I would like to know what
your results look like.

To start, here is a microbenchmarking program for clock_gettime(2) on
amd64.  If you have the userspace timecounter, then

clock_gettime(CLOCK_MONOTONIC, ...);

is a suitable surrogate for nanouptime(9), so this microbenchmark can
actually tell us about how nanouptime(9) or nanotime(9) would be
impacted by a comparable change in the kernel timecounter.

--

/*
 * clock_gettime-bench.c
 */
#include <err.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static uint64_t
rdtsc_lfence(void)
{
uint32_t hi, lo;

__asm volatile("lfence; rdtsc; lfence" : "=d" (hi), "=a" (lo));
return ((uint64_t)hi << 32) | lo;
}

int
main(int argc, char *argv[])
{
struct timespec now;
uint64_t begin, end;
long long count, i;
const char *errstr;

if (argc != 2) {
fprintf(stderr, "usage: %s count\n", getprogname());
return 1;
}
count = strtonum(argv[1], 1, LLONG_MAX, &errstr);
if (errstr != NULL)
errx(1, "count is %s: %s", errstr, argv[1]);

begin = rdtsc_lfence();
for (i = 0; i < count; i++)
clock_gettime(CLOCK_MONOTONIC, &now);
end = rdtsc_lfence();

printf("%lld\t%llu\n", count, end - begin);

return 0;
}

--

Now consider a benchmark of 100K clock_gettime(2) calls against the
userspace timecounter.

$ clock_gettime-bench 100000
100000  15703664

Let's collect 10K of these benchmarks -- our samples -- atop an
unpatched libc.  Use the shell script below.  Note that we throw out
samples where we hit a context switch.

--

#! /bin/sh

[ $# -ne 1 ] && exit 1
RESULTS=$1
shift

TIME=$(mktemp) || exit 1
TMP=$(mktemp) || exit 1

# Collect 10K samples.
i=0
while [ $i -lt 10000 ]; do
# Call clock_gettime(2) 100K times.
/usr/bin/time -l ~/scratch/clock_gettime-bench 100000 > $TMP 2> $TIME
# Ignore this sample if a context switch occurred.
if egrep -q '[1-9][0-9]* +(in)?voluntary context' $TIME; then
continue
fi
cat $TMP >> $RESULTS
i=$((i + 1))
done

rm $TMP $TIME

--

Run it like this:

$ ksh bench.sh unpatched.out

That will take ~5-10 minutes at most.

Next, we'll patch libc to add the LFENCE to the userspace timecounter.

Index: usertc.c
===
RCS file: /cvs/src/lib/libc/arch/amd64/gen/usertc.c,v
retrieving revision 1.2
diff -u -p -r1.2 usertc.c
--- usertc.c	8 Jul 2020 09:17:48 -0000	1.2
+++ usertc.c	22 Aug 2020 22:18:47 -0000
@@ -19,10 +19,10 @@
 #include 
 
 static inline u_int
-rdtsc(void)
+rdtsc_lfence(void)
 {
uint32_t hi, lo;
-   asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+   asm volatile("lfence; rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)lo)|(((uint64_t)hi)<<32);
 }
 
@@ -31,7 +31,7 @@ tc_get_timecount(struct timekeep *tk, u_
 {
switch (tk->tk_user) {
case TC_TSC:
-   *tc = rdtsc();
+   *tc = rdtsc_lfence();
return 0;
}
 
--

Recompile and reinstall libc.

Then rerun the benchmark.  Be careful not to overwrite our results
from the unpatched libc:

$ ksh bench.sh patched.out

--

Alright, now let's compare the results.  I'm not a mathemagician so I
use ministat and trust it implicitly.  A stat jock could probably do
this in R or with some python, but I am not that clever, so I will
stick with ministat.

There is no ministat port for OpenBSD, but it is pretty trivial to
clone this github repo and build it on -current:

https://github.com/thorduri/ministat

--

Okay, you have ministat?

Let's compare the results.  We want the 2nd column in the output
(-C2).  I'm not interested in the graph (-q), given our population
size.  We have N=10000, so let's push the CI up (-c 99.5).

$ ~/repo/ministat/ministat -C2 -q -c99.5 unpatched.out patched.out
x unpatched.out
+ patched.out
    N           Min           Max        Median           Avg        Stddev
x 10000      13752102      18019218      14442398      14431918     237842.31
+ 10000      15196970      16992030      15721390      15779178      181623.5
Difference at 99.5% confidence
1.34726e+06 +/- 9247.11
9.33528% +/- 0.064074%
(Student's t, pooled s = 211608)

So, in 

ldapd(8): fix, simplify UUID timestamp code

2020-08-19 Thread Scott Cheloha
Hi,

I was auditing the tree for odd-looking time structure usage and I
came across the UUID code in ldapd(8), uuid.c.

time_cmp() is backwards.  Or the caller is misusing it.  One or the
other.  It returns -1 if tv1 exceeds tv2 but the comments in the
caller give the opposite impression.  I don't think this code has
ever worked as intended.

It would be a lot easier if we just threw the code out and used random
UUIDs.  After reading over the RFC it seems to me that time-based
UUIDs are collision-prone.  Their implementation is also complicated.
Purely random UUIDs should effectively never collide and are trivial
to implement.
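
For comparison, a version-4 (random) UUID is only a few lines.  A
sketch against a plain 16-byte buffer (the afsUUID struct ldapd uses
is laid out differently, so this is illustrative only):

	#include <stdint.h>
	#include <stdlib.h>

	void
	uuid_create_random(uint8_t uuid[16])
	{
		arc4random_buf(uuid, 16);
		uuid[6] = (uuid[6] & 0x0f) | 0x40;	/* version 4 */
		uuid[8] = (uuid[8] & 0x3f) | 0x80;	/* RFC 4122 variant */
	}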

However, assuming we can't just use random UUIDs, here's an attempt at
improving this code:

- Use clock_gettime(2).  With nanosecond resolution we don't need
  a 'counter'.

- Reduce the scope of all the static state to uuid_create().

- Shrink the loop.  Just read the clock until it changes, then decide
  what to do re. seq_num.  This is effectively what the example code in
  RFC 4122 does.

I'm unsure what the right thing to do is if the system clock predates
the UUID epoch (Oct 15 1582).  My code just returns zero.  Maybe we
should just kill the daemon in that case?  The UUIDv1 scheme breaks
down if time is that seriously screwed up.

Is there an active ldapd(8) person?  Or at least someone with an
ldapd(8) setup who can test this?

Thoughts?

Index: uuid.c
===
RCS file: /cvs/src/usr.sbin/ldapd/uuid.c,v
retrieving revision 1.6
diff -u -p -r1.6 uuid.c
--- uuid.c	26 Apr 2018 12:42:51 -0000	1.6
+++ uuid.c	20 Aug 2020 01:44:00 -0000
@@ -63,27 +63,8 @@
 
 #include "uuid.h"
 
-static uint32_t seq_num;
-static struct timeval last_time;
-static int32_t counter;
-static char nodeaddr[6];
-
 enum { UUID_NODE_MULTICAST = 0x80 };
 
-static int
-time_cmp(struct timeval *tv1, struct timeval *tv2)
-{
-if (tv1->tv_sec > tv2->tv_sec)
-   return -1;
-if (tv1->tv_sec < tv2->tv_sec)
-   return 1;
-if (tv1->tv_usec > tv2->tv_usec)
-   return -1;
-if (tv1->tv_usec < tv2->tv_usec)
-   return 1;
-return 0;
-}
-
 static void
 get_node_addr(char *addr)
 {
@@ -138,6 +119,40 @@ get_node_addr(char *addr)
 }
 
 /*
+ * A UUID v1 timestamp:
+ *
+ * - 60 bits.
+ * - Unsigned.
+ * - Epoch at Oct 15 1582 00:00:00 UTC.
+ * - Increments every 100 nanoseconds.
+ */
+#define UUID_EPOCH_OFFSET	12219292800LL
+#define UUID_TIME_MAX		(1ULL << 60)
+#define UUID_HZ			10000000LL
+#define NSEC_PER_UUID_TICK	100LL
+
+static uint64_t
+get_uuid_timestamp(void)
+{
+   static const struct timespec min = { -UUID_EPOCH_OFFSET, 0 };
+   static const struct timespec max = {
+   UUID_TIME_MAX / UUID_HZ,
+   UUID_TIME_MAX % UUID_HZ * NSEC_PER_UUID_TICK
+   };
+   struct timespec utc;
+   uint64_t timestamp;
+
+   clock_gettime(CLOCK_REALTIME, &utc);
+   if (timespeccmp(&utc, &min, <))
+           return 0;
+   if (timespeccmp(&max, &utc, <))
+           return UUID_TIME_MAX;
+   timestamp = (UUID_EPOCH_OFFSET + utc.tv_sec) * UUID_HZ;
+   timestamp += utc.tv_nsec / NSEC_PER_UUID_TICK;
+   return timestamp;
+}
+
+/*
  *Creates a new UUID.
  */
 
@@ -145,55 +160,32 @@ void
 uuid_create(afsUUID *uuid)
 {
 static int uuid_inited = 0;
-struct timeval tv;
-int ret, got_time;
+static uint64_t last_time;
+static uint32_t seq_num;
+static char nodeaddr[6];
 uint64_t dce_time;
 
 if (uuid_inited == 0) {
-   gettimeofday(&last_time, NULL);
+   last_time = get_uuid_timestamp();
seq_num = arc4random();
get_node_addr(nodeaddr);
uuid_inited = 1;
 }
 
-    gettimeofday(&tv, NULL);
-
-got_time = 0;
+while ((dce_time = get_uuid_timestamp()) == last_time)
+   continue;
 
-do {
-   ret = time_cmp(&tv, &last_time);
-   if (ret < 0) {
-   /* Time went backward, just inc seq_num and be done.
-* seq_num is 6 + 8 bit field it the uuid, so let it wrap
-* around. don't let it be zero.
-*/
-   seq_num = (seq_num + 1) & 0x3fff ;
-   if (seq_num == 0)
-   seq_num++;
-   got_time = 1;
-   counter = 0;
-   last_time = tv;
-   } else if (ret > 0) {
-   /* time went forward, reset counter and be happy */
-   last_time = tv;
-   counter = 0;
-   got_time = 1;
-   } else {
-#define UUID_MAX_HZ (1) /* make this bigger fix you have larger tickrate */
-#define MULTIPLIER_100_NANO_SEC 10
-   if (++counter < UUID_MAX_HZ * MULTIPLIER_100_NANO_SEC)
-   got_time = 1;
-   }
-} while(!got_time);
+if (dce_time < last_time) {
+	/* Time went backward, just inc seq_num and be done.
+	 * seq_num is a 6 + 8 bit field in the uuid, so let it wrap
+	 * around. don't let it be zero.
+*/
+   seq_num = (seq_num + 

Re: getitimer(2), setitimer(2): merge critical sections

2020-08-17 Thread Scott Cheloha
On Mon, Aug 17, 2020 at 12:57:33PM -0600, Theo de Raadt wrote:
> Scott Cheloha  wrote:
> 
> > There is one behavior change: in the setitimer(2) swap case it is now
> > possible to EFAULT on copyout(9) *after* you have written the new
> > timer value and (possibly) started the ITIMER_REAL timeout.
> > 
> > For example, the following code now yields EFAULT even though a new
> > oneshot timer has been started successfully.
> > 
> > struct itimerval new;
> > int error;
> > 
> > new.it_value.tv_sec = 1;
> > new.it_value.tv_usec = 0;
> > timerclear(_interval);
> > error = setitimer(ITIMER_REAL, , 0xdeadbeef);
> > if (error)
> > warn("setitimer");
> > 
> > I don't think there is a way to avoid this without introducing a bunch
> > of extra complexity.  The critical section is protected by a mutex and
> > copyout(9) can sleep, so we have to wait until we leave the critical
> > section to copyout(9).  If we leave the mutex to do the copyout(9)
> > before writing the new timer value then the swap is no longer atomic.
> > Of course, this is not an issue *now*, but when the syscalls are
> > unlocked you will lose atomicity.
> > 
> > Personally I don't think this is a huge deal.  If you're getting
> > EFAULT there is a bigger problem in your code.
> 
> Let's go back to this first mail.
> 
> I suspect it is OK to update the timout, even if the final copyout (and
> syscall) then returns EFAULT.
> 
> It looks like historical 4.4BSD was "copyout then update".  FreeBSD is
> "update then copyout".
> 
> I certainly don't think it is worthwhile creating a problem which
> is somewhat similar to a TOCTOU.  Even your proposal to do the
> address-space range check is a TOCTOU; it fixes nothing since the
> address space can still be flipped.
> 
> What do other systems do?

FreeBSD and Linux do "update then copyout".  They don't do any cleanup
if the copyout fails, i.e. the new timer is left running.

DragonflyBSD, NetBSD, and illumos/Solaris do "copyout then update".
They have separate critical sections for reading and writing the
timer.  This means the timer is *not* installed if there is a copyout
error but that the "swap" is not atomic: there is a race after you
read the timer where the timeout (or hardclock(9)) can update the
timer after you have read it.

Near as I can tell you can't have it both ways without permitting
copyout from within the critical section.

Assuming that isn't allowed, I would rather have a single critical
section.

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -0000	1.140
+++ kern_time.c 17 Aug 2020 22:47:48 -0000
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+	itimer = &pr->ps_timer[which];
+
+	if (itv != NULL) {
+		TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+		TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_enter(&itimer_mtx);
+
+	if (olditv != NULL)
+		oldits = *itimer;
+	if (itv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec cts;
+			getnanouptime(&cts);
+			if (timespecisset(&its.it_value)) {
+				timo = tstohz(&its.it_value);
+				timeout_add(&pr->ps_realit_to, timo);
+				timespecadd(&its.it_value, &cts, &its.it_value);
+			} else
+				timeout_del(&pr->ps_realit_to);
+		}
+		*itimer = its;
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec now;
+

Re: getitimer(2), setitimer(2): merge critical sections

2020-08-17 Thread Scott Cheloha
On Fri, Aug 14, 2020 at 06:11:25PM -0600, Theo de Raadt wrote:
> > It has occurred to me that we could do a trial copyout(9) in
> > sys_setitimer() before entering the critical section.  This is a *bit*
> > wasteful, but is relatively inexpensive and narrows the behavior
> > change I mentioned down to truly improbable cases involving multiple
> > threads and munmap(2).
> 
> That sounds scary.  You are touching userland memory twice, and I could
> imagine a situation where it isn't the same memory because it gets
> swapped out in a shared storage situation.
> 
> It sounds terribly wrong to do that.

Is there a way to check that a uaddr_t range is writable by a given
process without actually calling copyout(9) and checking for EFAULT?

I'm not familiar with uvm(9), but uvm_map_checkprot(9) seems to do
what I'm looking for.

Of course, no other syscalls use it for this purpose, so I'm probably
overbraining this one.

See my usage below in sys_setitimer().
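
In outline, the check would look something like this (a sketch; the
protection argument is the part I'm least sure about):

	/* sys_setitimer(), before entering the critical section: */
	if (oitv != NULL &&
	    !uvm_map_checkprot(&p->p_vmspace->vm_map, (vaddr_t)oitv,
	    (vaddr_t)oitv + sizeof(*oitv), PROT_WRITE))
		return (EFAULT);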

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -0000	1.140
+++ kern_time.c 17 Aug 2020 18:38:14 -0000
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+	itimer = &pr->ps_timer[which];
+
+	if (itv != NULL) {
+		TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+		TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_enter(&itimer_mtx);
+
+	if (olditv != NULL)
+		oldits = *itimer;
+	if (itv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec cts;
+			getnanouptime(&cts);
+			if (timespecisset(&its.it_value)) {
+				timo = tstohz(&its.it_value);
+				timeout_add(&pr->ps_realit_to, timo);
+				timespecadd(&its.it_value, &cts, &its.it_value);
+			} else
+				timeout_del(&pr->ps_realit_to);
+		}
+		*itimer = its;
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_leave(&itimer_mtx);
+
+	if (olditv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec now;
+			getnanouptime(&now);
+			if (timespecisset(&oldits.it_value)) {
+				if (timespeccmp(&oldits.it_value, &now, <))
+					timespecclear(&oldits.it_value);
+				else
+					timespecsub(&oldits.it_value, &now,
+					    &oldits.it_value);
+			}
+		}
+		TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+		TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+	}
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-	itimer = &p->p_p->ps_timer[which];
	memset(&aitv, 0, sizeof(aitv));

-	if (which != ITIMER_REAL)
-		mtx_enter(&itimer_mtx);
-	its = *itimer;
-	if (which != ITIMER_REAL)
-		mtx_leave(&itimer_mtx);
-
-	if (which == ITIMER_REAL) {
-		struct timespec now;
-
-		getnanouptime(&now);
-		/*
-		 * Convert from absolute to relative time in .it_value
-		 * part of real time timer.  If time for real time timer
-		 * has passed return 0, else return difference between
-		 * current time and time for the timer to go off.
-		 */
-		if (timespecisset(&its.it_value)) {
-			if (timespeccmp(&its.it_value, &now, <))
-				timespecclear(&its.it_value);
- 

Re: getitimer(2), setitimer(2): merge critical sections

2020-08-14 Thread Scott Cheloha
On Wed, Aug 12, 2020 at 01:58:08PM -0500, Scott Cheloha wrote:
> 
> [...]
> 
> There is one behavior change: in the setitimer(2) swap case it is now
> possible to EFAULT on copyout(9) *after* you have written the new
> timer value and (possibly) started the ITIMER_REAL timeout.
> 
> For example, the following code now yields EFAULT even though a new
> oneshot timer has been started successfully.
> 
> [...]
> 
> I don't think there is a way to avoid this without introducing a bunch
> of extra complexity.  [...]

It has occurred to me that we could do a trial copyout(9) in
sys_setitimer() before entering the critical section.  This is a *bit*
wasteful, but is relatively inexpensive and narrows the behavior
change I mentioned down to truly improbable cases involving multiple
threads and munmap(2).

Updated patch below.

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -0000	1.140
+++ kern_time.c 14 Aug 2020 23:59:23 -0000
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+	itimer = &pr->ps_timer[which];
+
+	if (itv != NULL) {
+		TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+		TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_enter(&itimer_mtx);
+
+	if (olditv != NULL)
+		oldits = *itimer;
+	if (itv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec cts;
+			getnanouptime(&cts);
+			if (timespecisset(&its.it_value)) {
+				timo = tstohz(&its.it_value);
+				timeout_add(&pr->ps_realit_to, timo);
+				timespecadd(&its.it_value, &cts, &its.it_value);
+			} else
+				timeout_del(&pr->ps_realit_to);
+		}
+		*itimer = its;
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_leave(&itimer_mtx);
+
+	if (olditv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec now;
+			getnanouptime(&now);
+			if (timespecisset(&oldits.it_value)) {
+				if (timespeccmp(&oldits.it_value, &now, <))
+					timespecclear(&oldits.it_value);
+				else
+					timespecsub(&oldits.it_value, &now,
+					    &oldits.it_value);
+			}
+		}
+		TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+		TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+	}
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-	itimer = &p->p_p->ps_timer[which];
	memset(&aitv, 0, sizeof(aitv));

-	if (which != ITIMER_REAL)
-		mtx_enter(&itimer_mtx);
-	its = *itimer;
-	if (which != ITIMER_REAL)
-		mtx_leave(&itimer_mtx);
-
-	if (which == ITIMER_REAL) {
-		struct timespec now;
-
-		getnanouptime(&now);
-		/*
-		 * Convert from absolute to relative time in .it_value
-		 * part of real time timer.  If time for real time timer
-		 * has passed return 0, else return difference between
-		 * current time and time for the timer to go off.
-		 */
-		if (timespecisset(&its.it_value)) {
-			if (timespeccmp(&its.it_value, &now, <))
-				timespecclear(&its.it_value);
-   els

getitimer(2), setitimer(2): merge critical sections

2020-08-12 Thread Scott Cheloha
Hi,

Things in getitimer(2) and setitimer(2) have been rearranged
adequately.  Their critical sections are ready to be combined.

Merging these critical sections is necessary to make getitimer(2) and
setitimer(2) MP-safe.  They are not ready to run without the kernel
lock just yet, but this diff is a prerequisite.  Everything up until
now was done to make this patch less painful.

So, this patch:

We introduce a new kernel subroutine, "setitimer()", that does all of
the common, error-free work for both getitimer(2) and setitimer(2).
The high-level steps are as follows:

 - convert input from itimerval to itimerspec
 - enter the critical section
 - read the timer's current value
 - (ITIMER_REAL) do timeout_add(9)/timeout_del(9)
 - (ITIMER_REAL) convert input from relative to absolute time
 - write the timer's new value
 - leave the critical section
 - (ITIMER_REAL) convert output from absolute to relative time
 - convert output from itimerspec to itimerval

All of this code has been moved more-or-less verbatim from
sys_getitimer() and sys_setitimer() and interleaved within the
new subroutine around a single critical section.

Meanwhile, sys_getitimer() and sys_setitimer() are left to handle all
of the error-prone work: copyin(9), input validation, and copyout(9).

The changes in sys_getitimer() are straightforward.  All of its common
code folds neatly into the new subroutine without any changes to the
surrounding code.

sys_setitimer() is trickier because it doesn't use SCARG directly.
I've introduced additional itimerval pointers to keep changes minimal
here.  However, I think it would benefit from the direct use of SCARG
to distinguish userspace addresses from kernel stack addresses.  That
can wait until later, though.

sys_setitimer() now performs its own copyout(9) instead of relying on
sys_getitimer() to do it implicitly.  This adds a bit of additional
code but I would rather see the syscall do copyout(9) explicitly.

There is one behavior change: in the setitimer(2) swap case it is now
possible to EFAULT on copyout(9) *after* you have written the new
timer value and (possibly) started the ITIMER_REAL timeout.

For example, the following code now yields EFAULT even though a new
oneshot timer has been started successfully.

struct itimerval new;
int error;

new.it_value.tv_sec = 1;
new.it_value.tv_usec = 0;
timerclear(_interval);
error = setitimer(ITIMER_REAL, , 0xdeadbeef);
if (error)
warn("setitimer");

I don't think there is a way to avoid this without introducing a bunch
of extra complexity.  The critical section is protected by a mutex and
copyout(9) can sleep, so we have to wait until we leave the critical
section to copyout(9).  If we leave the mutex to do the copyout(9)
before writing the new timer value then the swap is no longer atomic.
Of course, this is not an issue *now*, but when the syscalls are
unlocked you will lose atomicity.

Personally I don't think this is a huge deal.  If you're getting
EFAULT there is a bigger problem in your code.

Thoughts?  ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -0000	1.140
+++ kern_time.c 12 Aug 2020 18:44:08 -0000
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+	itimer = &pr->ps_timer[which];
+
+	if (itv != NULL) {
+		TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+		TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+	}
+
+	if (which != ITIMER_REAL)
+		mtx_enter(&itimer_mtx);
+
+	if (olditv != NULL)
+		oldits = *itimer;
+	if (itv != NULL) {
+		if (which == ITIMER_REAL) {
+			struct timespec cts;
+			getnanouptime(&cts);
+			if (timespecisset(&its.it_value)) {
+				timo = tstohz(&its.it_value);
+				timeout_add(&pr->ps_realit_to, timo);
+				timespecadd(&its.it_value, &cts, &its.it_value);
+			} else
+   

setitimer(2): write new timer value once

2020-08-11 Thread Scott Cheloha
Hi,

Before merging the critical sections for setitimer(2) and getitimer(2)
we need to make their critical sections as similar as possible.

In getitimer(2) we read the timer value in one place in the code:

/* getitimer(2) pseudocode */
if (which != ITIMER_REAL)
mtx_enter(_mtx);

/* read the timer */

if (which != ITIMER_REAL)
mtx_leave(_mtx);

The corresponding write logic in setitimer(2) is laid out differently.
The code that actually performs the write is duplicated:

/* setitimer(2) pseudocode (as-is) */
if (which == ITIMER_REAL) {
/* ITIMER_REAL-specific stuff */
>>> pr->ps_timer[ITIMER_REAL] = aits;
} else {
mtx_enter(_mtx);
>>> pr->ps_timer[which] = aits;
mtx_leave(_mtx);
}

If we rearrange the setitimer(2) code to do the write in one place...

/* setitimer(2) pseudocode (rearranged) */
if (which != ITIMER_REAL)
mtx_enter(_mtx);

if (which == ITIMER_REAL) {
/* ITIMER_REAL-specific stuff */
}
>>> pr->ps_timer[which] = aits;

if (which != ITIMER_REAL)
mtx_leave(_mtx);

... merging the critical sections in a subsequent patch will be
easier.

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.137
diff -u -p -r1.137 kern_time.c
--- kern_time.c 11 Aug 2020 18:29:58 -0000	1.137
+++ kern_time.c 11 Aug 2020 20:43:32 -0000
@@ -597,6 +597,10 @@ sys_setitimer(struct proc *p, void *v, r
}
if (itvp == 0)
return (0);
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
if (which == ITIMER_REAL) {
struct timespec cts;
 
@@ -607,12 +611,11 @@ sys_setitimer(struct proc *p, void *v, r
timeout_add(&pr->ps_realit_to, timo);
timespecadd(&aits.it_value, &cts, &aits.it_value);
}
-   pr->ps_timer[ITIMER_REAL] = aits;
-   } else {
-   mtx_enter(&itimer_mtx);
-   pr->ps_timer[which] = aits;
-   mtx_leave(&itimer_mtx);
}
+   pr->ps_timer[which] = aits;
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
 
return (0);
 }



setitimer(2): consolidate copyin(9) and input validation

2020-08-11 Thread Scott Cheloha
Hi,

For what are probably historical reasons we don't validate the
setitimer(2) input until after we have (optionally called getitimer(2)
to copyout(9) the current timer value.

Consolidating the copyin(9), input validation, and input conversion
into a single block makes the setitimer(2) code easier to follow.  It
also makes combining the critical sections of setitimer(2) and
getitimer(2) in a future patch simpler.

This changes our behavior in the error path.  We now fail with EINVAL
if your input (itv) is bogus *before* modifying your output (olditv).
However, any program relying on this behavior is broken: the contents
of olditv are undefined in the event of an error.

While here, remove some unnecessary (void *) casts.

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.136
diff -u -p -r1.136 kern_time.c
--- kern_time.c 11 Aug 2020 15:41:50 -  1.136
+++ kern_time.c 11 Aug 2020 16:29:35 -
@@ -580,9 +580,15 @@ sys_setitimer(struct proc *p, void *v, r
 	if (which < ITIMER_REAL || which > ITIMER_PROF)
 		return (EINVAL);
 	itvp = SCARG(uap, itv);
-	if (itvp && (error = copyin((void *)itvp, (void *)&aitv,
-	    sizeof(struct itimerval))))
-		return (error);
+	if (itvp) {
+		error = copyin(itvp, &aitv, sizeof(struct itimerval));
+		if (error)
+			return (error);
+		if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval))
+			return (EINVAL);
+		TIMEVAL_TO_TIMESPEC(&aitv.it_value, &aits.it_value);
+		TIMEVAL_TO_TIMESPEC(&aitv.it_interval, &aits.it_interval);
+	}
 	if (oitv != NULL) {
 		SCARG(&getargs, which) = which;
 		SCARG(&getargs, itv) = oitv;
@@ -591,10 +597,6 @@ sys_setitimer(struct proc *p, void *v, r
 	}
 	if (itvp == 0)
 		return (0);
-	if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval))
-		return (EINVAL);
-	TIMEVAL_TO_TIMESPEC(&aitv.it_value, &aits.it_value);
-	TIMEVAL_TO_TIMESPEC(&aitv.it_interval, &aits.it_interval);
 	if (which == ITIMER_REAL) {
 		struct timespec cts;
 



Re: process: annotate locking for setitimer(2) state

2020-08-09 Thread Scott Cheloha
On Sun, Aug 09, 2020 at 04:43:24PM +0200, Mark Kettenis wrote:
> > Date: Sat, 8 Aug 2020 19:46:14 -0500
> > From: Scott Cheloha 
> > 
> > Hi,
> > 
> > I want to annotate the locking for the per-process interval timers.
> > 
> > In the process struct, the ITIMER_REAL itimerspec and the ps_itimer_to
> > timeout are protected by the kernel lock.  These should be annotated
> > with "K", right?
> > 
> > Also in the process struct, the ITIMER_VIRTUAL and ITIMER_PROF
> > itimerspecs are protected by the global itimer_mtx.
> > 
> > However, I don't think "itimer_mtx" is the best name for it, as it
> > doesn't protect state for *all* per-process interval timers.  Just the
> > virtual ones.
> > 
> > Could I rename the mutex to "virtual_itimer_mtx"?  Then I can annotate
> > the state protected by it with "V", as shown here in this patch.
> 
> That's quite a long variable name though.  And it also protects
> ITIMER_PROF.  So I'd say the name would be at least as misleading as
> the current one and perhaps even more so.  You can just use "I" as the
> annotation perhaps?

The convention is to use "I" for immutable variables.  We do it
everywhere.  I don't think we should buck convention here.

I also proposed using "i" in a prior patch to annotate these
variables, but mpi@ said it was too close to "I".  Also, it's a global
lock, and we have settled on annotating only global locks with capital
letters.

If you don't want to rename the mutex I guess we could use "T" for
"timer".  We use "T" for other global locks (tc_lock, timeout_mutex)
but not in this context.

However, there are only so many letters.  Eventually this scheme will
run afoul of that limitation.  An idea I had re. the letter shortage
was to use two letters where necessary.  So instead of "I" you could
use "It" for "itimer".  We annotate locking hierarchies with commas so
there is no ambiguity when reading them.

For example, if the code for writing a hypothetical "ps_foo" process
struct member was:

KERNEL_LOCK();
mtx_enter(&itimer_mtx);
ps.ps_foo = 10;
mtx_leave(&itimer_mtx);
KERNEL_UNLOCK();

You could annotate it like this:

/*
 * Locks used to protect process struct members:
 *
 *  It  itimer_mtx
 *  K   kernel lock
 */
struct process {
/* [...] */
int ps_foo; /* [K,It] per-process foobar */
/* [...] */
};

anton@, mpi@: is that too radical or easily misread?

Sorry if this all seems fussy, but I'd like to get this right the
first time.

-Scott



hardclock(9): fix race with setitimer(2)

2020-08-09 Thread Scott Cheloha
Hi,

We update the ITIMER_VIRTUAL and ITIMER_PROF per-process interval
timers from hardclock(9).  If a timer is enabled we call itimerdecr()
to update and reload it as needed.  If a timer has expired we then set
a flag on the current thread to signal itself when returning to
userspace.

However, there is a race here with setitimer(2).  In hardclock(9) we
check whether a timer is enabled *before* entering itimer_mtx in
itimerdecr(), but once we have entered the mutex we don't double-check
that the timer is still enabled.  This is wrong.  Another thread may
have disabled the timer via setitimer(2) while we were entering the
mutex.

This patch adds the second check to itimerdecr().  If we lost the race
and the timer is disabled we return 1 to indicate that the timer has
not expired, i.e. that the thread should take no action.
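
For reference, the hardclock(9) call site does roughly this
(paraphrased from kern_clock.c, not verbatim):

	/* The enabled check happens before itimerdecr() takes the mutex. */
	if (CLKF_USERMODE(frame) &&
	    timespecisset(&pr->ps_timer[ITIMER_VIRTUAL].it_value) &&
	    itimerdecr(&pr->ps_timer[ITIMER_VIRTUAL], tick_nsec) == 0)
		atomic_setbits_int(&p->p_flag, P_ALRMPEND);

Between that unlocked timespecisset() check and the mtx_enter() inside
itimerdecr(), another CPU can zero it_value via setitimer(2), hence the
double-check under the mutex in the patch below.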

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.134
diff -u -p -r1.134 kern_time.c
--- kern_time.c 8 Aug 2020 01:01:26 -   1.134
+++ kern_time.c 9 Aug 2020 11:47:02 -
@@ -682,6 +682,20 @@ itimerdecr(struct itimerspec *itp, long 
 	NSEC_TO_TIMESPEC(nsec, &decrement);
 
 	mtx_enter(&itimer_mtx);
+
+	/*
+	 * Double-check that the timer is enabled.  We may have lost
+	 * a race with another thread in setitimer(2) when entering
+	 * itimer_mtx.
+	 */
+	if (!timespecisset(&itp->it_value)) {
+		mtx_leave(&itimer_mtx);
+		return (1);
+	}
+
+	/*
+	 * The timer is enabled.  Update and reload it as needed.
+	 */
 	timespecsub(&itp->it_value, &decrement, &itp->it_value);
 	if (itp->it_value.tv_sec >= 0 && timespecisset(&itp->it_value)) {
 		mtx_leave(&itimer_mtx);


