Re: getitimer(2), setitimer(2): merge critical sections

2020-09-25 Thread Scott Cheloha
On Tue, Sep 01, 2020 at 10:24:11AM -0500, Scott Cheloha wrote:
> On Mon, Aug 17, 2020 at 05:55:34PM -0500, Scott Cheloha wrote:
> > 
> > [...]
> 
> Two week bump.
> 
> In summary:
> 
> - Merge the critical sections so that "timer swap" with setitimer(2)
>   is atomic.
> 
> - To do this, move error-free operations into a common kernel
>   subroutine, setitimer().  Now we have one critical section.
> 
> - Leave error-prone operations in sys_getitimer() and sys_setitimer().
> 
> In order to make the "timer swap" atomic we leave the timer installed
> if the copyout(9) fails.  This isn't great, but it is unavoidable
> without permitting copyout(9) from within a mutex.  FreeBSD and Linux
> went this direction, too.  I would rather leave the timer running and
> have an atomic swap than race the hardclock(9) or the realitexpire()
> timeout.
> 
> [...]

6 week bump.  Diff attached again.  Basically nothing has changed.
We're moving the critical sections from sys_getitimer() and
sys_setitimer() into a new subroutine, setitimer(), and merging those
critical sections.

I'm confident that merging the critical sections is the right
direction to take setitimer(2).  It gets us closer to removing the
kernel lock and eliminates a race with hardclock(9).

In the near future we will need to call the setitimer() subroutine
during _exit(2) to safely cancel the ITIMER_REAL timeout so I think
putting the critical section into a subroutine we can call from
outside of kern_time.c is also the right thing to do.

I'm going to wait until after the tree is unlocked to commit this, as
the behavior change in the timer swap case *might* break some software
out there that isn't checking the setitimer(2) return code.  It's
unlikely, but it's possible, so I'll wait.

CC: everyone I've ever talked to about setitimer(2).

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 25 Sep 2020 23:32:08 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,62 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, const struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts, &its.it_value);
+   } else
+   timeout_del(&pr->ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL && timespecisset(&oldits.it_value)) {
+   struct timespec now;
+   getnanouptime(&now);
+   if (timespeccmp(&oldits.it_value, &now, <))
+   timespecclear(&oldits.it_value);
+   else {
+   timespecsub(&oldits.it_value, &now,
+   &oldits.it_value);
+   }
+   }
+   TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+   TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+   }
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +572,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
   

Re: getitimer(2), setitimer(2): merge critical sections

2020-09-01 Thread Scott Cheloha
On Mon, Aug 17, 2020 at 05:55:34PM -0500, Scott Cheloha wrote:
> 
> [...]

Two week bump.

In summary:

- Merge the critical sections so that "timer swap" with setitimer(2)
  is atomic.

- To do this, move error-free operations into a common kernel
  subroutine, setitimer().  Now we have one critical section.

- Leave error-prone operations in sys_getitimer() and sys_setitimer().

In order to make the "timer swap" atomic we leave the timer installed
if the copyout(9) fails.  This isn't great, but it is unavoidable
without permitting copyout(9) from within a mutex.  FreeBSD and Linux
went this direction, too.  I would rather leave the timer running and
have an atomic swap than race the hardclock(9) or the realitexpire()
timeout.

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 1 Sep 2020 15:19:07 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts, &its.it_value);
+   } else
+   timeout_del(&pr->ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec now;
+   getnanouptime(&now);
+   if (timespecisset(&oldits.it_value)) {
+   if (timespeccmp(&oldits.it_value, &now, <))
+   timespecclear(&oldits.it_value);
+   else
+   timespecsub(&oldits.it_value, &now,
+   &oldits.it_value);
+   }
+   }
+   TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+   TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+   }
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-   itimer = &p->p_p->ps_timer[which];
memset(&aitv, 0, sizeof(aitv));
 
-   if (which != ITIMER_REAL)
-   mtx_enter(&itimer_mtx);
-   its = *itimer;
-   if (which != ITIMER_REAL)
-   mtx_leave(&itimer_mtx);
-
-   if (which == ITIMER_REAL) {
-   struct timespec now;
-
-   getnanouptime(&now);
-   /*
-* Convert from absolute to relative time in .it_value
-* part of real time timer.  If time for real time timer
-* has passed return 0, else return difference between
-* current time and time for the timer to go off.
-*/
-   if (timespecisset(&its.it_value)) {
-   if (timespeccmp(&its.it_value, &now, <))
-   timespecclear(&its.it_valu

Re: getitimer(2), setitimer(2): merge critical sections

2020-08-17 Thread Scott Cheloha
On Mon, Aug 17, 2020 at 12:57:33PM -0600, Theo de Raadt wrote:
> Scott Cheloha  wrote:
> 
> > There is one behavior change: in the setitimer(2) swap case it is now
> > possible to EFAULT on copyout(9) *after* you have written the new
> > timer value and (possibly) started the ITIMER_REAL timeout.
> > 
> > For example, the following code now yields EFAULT even though a new
> > oneshot timer has been started successfully.
> > 
> > struct itimerval new;
> > int error;
> > 
> > new.it_value.tv_sec = 1;
> > new.it_value.tv_usec = 0;
> > timerclear(&new.it_interval);
> > error = setitimer(ITIMER_REAL, &new, 0xdeadbeef);
> > if (error)
> > warn("setitimer");
> > 
> > I don't think there is a way to avoid this without introducing a bunch
> > of extra complexity.  The critical section is protected by a mutex and
> > copyout(9) can sleep, so we have to wait until we leave the critical
> > section to copyout(9).  If we leave the mutex to do the copyout(9)
> > before writing the new timer value then the swap is no longer atomic.
> > Of course, this is not an issue *now*, but when the syscalls are
> > unlocked you will lose atomicity.
> > 
> > Personally I don't think this is a huge deal.  If you're getting
> > EFAULT there is a bigger problem in your code.
> 
> Let's go back to this first mail.
> 
> I suspect it is OK to update the timout, even if the final copyout (and
> syscall) then returns EFAULT.
> 
> It looks like historical 4.4BSD was "copyout then update".  FreeBSD is
> "update them copyout".
> 
> I certainly don't think it is worthwhile creating a problem which
> is somewhat similar to a TOCTOU.  Even your proposal to do the
> address-space range check is a TOCTOU, it fixes nothing since the
> address space can still be flipped).
> 
> What do other systems do?

FreeBSD and Linux do "update then copyout".  They don't do any cleanup
if the copyout fails, i.e. the new timer is left running.

DragonflyBSD, NetBSD, and illumos/Solaris do "copyout then update".
They have separate critical sections for reading and writing the
timer.  This means the timer is *not* installed if there is a copyout
error but that the "swap" is not atomic: there is a race after you
read the timer where the timeout (or hardclock(9)) can update the
timer after you have read it.

Near as I can tell you can't have it both ways without permitting
copyout from within the critical section.

Assuming that isn't allowed, I would rather have a single critical
section.

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 17 Aug 2020 22:47:48 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts, &its.it_value);
+   } else
+   timeout_del(&pr->ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec now;
+   getnanouptime(&now);
+   if (timespecisset(&oldits.it_value)) {
+   if (timespeccmp(&oldits.it_value, &now, <))
+   timespe

Re: getitimer(2), setitimer(2): merge critical sections

2020-08-17 Thread Theo de Raadt
Scott Cheloha  wrote:

> There is one behavior change: in the setitimer(2) swap case it is now
> possible to EFAULT on copyout(9) *after* you have written the new
> timer value and (possibly) started the ITIMER_REAL timeout.
> 
> For example, the following code now yields EFAULT even though a new
> oneshot timer has been started successfully.
> 
>   struct itimerval new;
>   int error;
> 
>   new.it_value.tv_sec = 1;
>   new.it_value.tv_usec = 0;
>   timerclear(&new.it_interval);
>   error = setitimer(ITIMER_REAL, &new, 0xdeadbeef);
>   if (error)
>   warn("setitimer");
> 
> I don't think there is a way to avoid this without introducing a bunch
> of extra complexity.  The critical section is protected by a mutex and
> copyout(9) can sleep, so we have to wait until we leave the critical
> section to copyout(9).  If we leave the mutex to do the copyout(9)
> before writing the new timer value then the swap is no longer atomic.
> Of course, this is not an issue *now*, but when the syscalls are
> unlocked you will lose atomicity.
> 
> Personally I don't think this is a huge deal.  If you're getting
> EFAULT there is a bigger problem in your code.

Let's go back to this first mail.

I suspect it is OK to update the timout, even if the final copyout (and
syscall) then returns EFAULT.

It looks like historical 4.4BSD was "copyout then update".  FreeBSD is
"update them copyout".

I certainly don't think it is worthwhile creating a problem which
is somewhat similar to a TOCTOU.  Even your proposal to do the
address-space range check is a TOCTOU, it fixes nothing since the
address space can still be flipped).

What do other systems do?





Re: getitimer(2), setitimer(2): merge critical sections

2020-08-17 Thread Scott Cheloha
On Fri, Aug 14, 2020 at 06:11:25PM -0600, Theo de Raadt wrote:
> > It has occurred to me that we could do a trial copyout(9) in
> > sys_setitimer() before entering the critical section.  This is a *bit*
> > wasteful, but is relatively inexpensive and narrows the behavior
> > change I mentioned down to truly improbable cases involving multiple
> > threads and munmap(2).
> 
> That sounds scary.  You are touching userland memory twice, and I could
> imagine a situation where it isn't the same memory because it gets
> swapped out in a shared storage situation.
> 
> It sounds terribly wrong to do that.

Is there a way to check that a uaddr_t range is writable by a given
process without actually calling copyout(9) and checking for EFAULT?

I'm not familiar with uvm(9), but uvm_map_checkprot(9) seems to do
what I'm looking for.

Of course, no other syscalls use it for this purpose, so I'm probably
overbraining this one.

See my usage below in sys_setitimer().

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 17 Aug 2020 18:38:14 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts, &its.it_value);
+   } else
+   timeout_del(&pr->ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec now;
+   getnanouptime(&now);
+   if (timespecisset(&oldits.it_value)) {
+   if (timespeccmp(&oldits.it_value, &now, <))
+   timespecclear(&oldits.it_value);
+   else
+   timespecsub(&oldits.it_value, &now,
+   &oldits.it_value);
+   }
+   }
+   TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+   TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+   }
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-   itimer = &p->p_p->ps_timer[which];
memset(&aitv, 0, sizeof(aitv));
 
-   if (which != ITIMER_REAL)
-   mtx_enter(&itimer_mtx);
-   its = *itimer;
-   if (which != ITIMER_REAL)
-   mtx_leave(&itimer_mtx);
-
-   if (which == ITIMER_REAL) {
-   struct timespec now;
-
-   getnanouptime(&now);
-   /*
-* Convert from absolute to relative time in .it_value
-* part of real time timer.  If time for real time timer
-* has passed return 0, else return difference between
-* current time and 

Re: getitimer(2), setitimer(2): merge critical sections

2020-08-14 Thread Theo de Raadt
> It has occurred to me that we could do a trial copyout(9) in
> sys_setitimer() before entering the critical section.  This is a *bit*
> wasteful, but is relatively inexpensive and narrows the behavior
> change I mentioned down to truly improbable cases involving multiple
> threads and munmap(2).

That sounds scary.  You are touching userland memory twice, and I could
imagine a situation where it isn't the same memory because it gets
swapped out in a shared storage situation.

It sounds terribly wrong to do that.



Re: getitimer(2), setitimer(2): merge critical sections

2020-08-14 Thread Scott Cheloha
On Wed, Aug 12, 2020 at 01:58:08PM -0500, Scott Cheloha wrote:
> 
> [...]
> 
> There is one behavior change: in the setitimer(2) swap case it is now
> possible to EFAULT on copyout(9) *after* you have written the new
> timer value and (possibly) started the ITIMER_REAL timeout.
> 
> For example, the following code now yields EFAULT even though a new
> oneshot timer has been started successfully.
> 
> [...]
> 
> I don't think there is a way to avoid this without introducing a bunch
> of extra complexity.  [...]

It has occurred to me that we could do a trial copyout(9) in
sys_setitimer() before entering the critical section.  This is a *bit*
wasteful, but is relatively inexpensive and narrows the behavior
change I mentioned down to truly improbable cases involving multiple
threads and munmap(2).

Updated patch below.

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 14 Aug 2020 23:59:23 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts, &its.it_value);
+   } else
+   timeout_del(&pr->ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(&itimer_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec now;
+   getnanouptime(&now);
+   if (timespecisset(&oldits.it_value)) {
+   if (timespeccmp(&oldits.it_value, &now, <))
+   timespecclear(&oldits.it_value);
+   else
+   timespecsub(&oldits.it_value, &now,
+   &oldits.it_value);
+   }
+   }
+   TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value);
+   TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval);
+   }
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-   itimer = &p->p_p->ps_timer[which];
memset(&aitv, 0, sizeof(aitv));
 
-   if (which != ITIMER_REAL)
-   mtx_enter(&itimer_mtx);
-   its = *itimer;
-   if (which != ITIMER_REAL)
-   mtx_leave(&itimer_mtx);
-
-   if (which == ITIMER_REAL) {
-   struct timespec now;
-
-   getnanouptime(&now);
-   /*
-* Convert from absolute to relative time in .it_value
-* part of real time timer.  If time for real time timer
-* has passed return 0, else return difference between
-* current time and time for the timer to go off.
-*/
-   if (timespecisset(&its.it_value)) {
-   if (timespecc

getitimer(2), setitimer(2): merge critical sections

2020-08-12 Thread Scott Cheloha
Hi,

Things in getitimer(2) and setitimer(2) have been rearranged
adequately.  Their critical sections are ready to be combined.

Merging these critical sections is necessary to make getitimer(2) and
setitimer(2) MP-safe.  They are not ready to run without the kernel
lock just yet, but this diff is a prerequisite.  Everything up until
now was done to make this patch less painful.

So, this patch:

We introduce a new kernel subroutine, "setitimer()", that does all of
the common, error-free work for both getitimer(2) and setitimer(2).
The high-level steps are as follows:

 - convert input from itimerval to itimerspec
 - enter the critical section
 - read the timer's current value
 - (ITIMER_REAL) do timeout_add(9)/timeout_del(9)
 - (ITIMER_REAL) convert input from relative to absolute time
 - write the timer's new value
 - leave the critical section
 - (ITIMER_REAL) convert output from absolute to relative time
 - convert output from itimerspec to itimerval

All of this code has been moved more-or-less verbatim from
sys_getitimer() and sys_setitimer() and interleaved within the
new subroutine around a single critical section.

Meanwhile, sys_getitimer() and sys_setitimer() are left to handle all
of the error-prone work: copyin(9), input validation, and copyout(9).

The changes in sys_getitimer() are straightforward.  All of its common
code folds neatly into the new subroutine without any changes to the
surrounding code.

sys_setitimer() is trickier because it doesn't use SCARG directly.
I've introduced additional itimerval pointers to keep changes minimal
here.  However, I think it would benefit from the direct use of SCARG
to distinguish userspace addresses from kernel stack addresses.  That
can wait until later, though.

sys_setitimer() now performs its own copyout(9) instead of relying on
sys_getitimer() to do it implicitly.  This adds a bit of additional
code but I would rather see the syscall do copyout(9) explicitly.

There is one behavior change: in the setitimer(2) swap case it is now
possible to EFAULT on copyout(9) *after* you have written the new
timer value and (possibly) started the ITIMER_REAL timeout.

For example, the following code now yields EFAULT even though a new
oneshot timer has been started successfully.

struct itimerval new;
int error;

new.it_value.tv_sec = 1;
new.it_value.tv_usec = 0;
timerclear(&new.it_interval);
error = setitimer(ITIMER_REAL, &new, 0xdeadbeef);
if (error)
warn("setitimer");

I don't think there is a way to avoid this without introducing a bunch
of extra complexity.  The critical section is protected by a mutex and
copyout(9) can sleep, so we have to wait until we leave the critical
section to copyout(9).  If we leave the mutex to do the copyout(9)
before writing the new timer value then the swap is no longer atomic.
Of course, this is not an issue *now*, but when the syscalls are
unlocked you will lose atomicity.

Personally I don't think this is a huge deal.  If you're getting
EFAULT there is a bigger problem in your code.

Thoughts?  ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 12 Aug 2020 18:44:08 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = &pr->ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value);
+   TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(&itimer_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime(&cts);
+   if (timespecisset(&its.it_value)) {
+   timo = tstohz(&its.it_value);
+   timeout_add(&pr->ps_realit_to, timo);
+   timespecadd(&its.it_value, &cts