On Wed, Jul 15, 2020 at 09:08:29AM -0500, Scott Cheloha wrote:

> Hi,
> 
> adjtime(2) skews the clock at a rate of up to 5000ppm, i.e. by up to
> 5000 microseconds of adjustment per elapsed second.  The way this
> actually happens is pretty straightforward: at the start of every UTC
> second we call ntp_update_second() from tc_windup(), which moves the
> next slice of th_adjtimedelta into th_adjustment.  th_adjustment is
> then mixed into the scale for one UTC second.  This cycle slowly chips
> away at th_adjtimedelta, eventually reducing it to zero.
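> 
> A sketch of the per-second step, paraphrasing the current code in
> kern_tc.c (slightly simplified; the constants and the fixed-point
> conversion are illustrative rather than a verbatim copy):
> 
> 	void
> 	ntp_update_second(struct timehands *th)
> 	{
> 		int64_t adj;
> 
> 		/* Consume at most 5000us of the outstanding delta. */
> 		if (th->th_adjtimedelta > 0)
> 			adj = MIN(5000, th->th_adjtimedelta);
> 		else
> 			adj = MAX(-5000, th->th_adjtimedelta);
> 		th->th_adjtimedelta -= adj;
> 
> 		/* Hand the slice to tc_windup() as a fixed-point rate. */
> 		th->th_adjustment = (adj * 1000) << 32;
> 	}
> 
> A positive adjustment makes the scale computed by tc_windup() a hair
> larger, so the clock gains up to 5000us over the following second; a
> negative adjustment does the opposite.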
> 
> This is fine, except that using UTC for your update period means you
> have to work around the fact that the UTC time can jump forward by a
> huge amount.
> There are two notable jumps:
> 
> 1. The big jump forward to the RTC time during boot.
> 
> 2. The big jump forward to the RTC time after each resume.
> 
> To handle this we have a magic number in the code, LARGE_STEP.  If the
> UTC time jumps forward by more than LARGE_STEP (200) seconds we clamp
> the number of ntp_update_second() calls to 2 to avoid looping endlessly in
> tc_windup().  Here we find a wart: we do 2 calls to account for a
> missed leap second, even though we no longer handle those in the
> kernel.
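> 
> For reference, the LARGE_STEP logic this diff removes (see the patch
> below) boils down to the following, with comments added:
> 
> 	/* Current UTC time according to the new timehands. */
> 	bt = th->th_offset;
> 	bintimeadd(&bt, &th->th_boottime, &bt);
> 
> 	/* Whole UTC seconds elapsed since the previous windup. */
> 	i = bt.sec - tho->th_microtime.tv_sec;
> 
> 	/*
> 	 * A jump of more than LARGE_STEP seconds is assumed to be a
> 	 * boot or resume step, so pretend only two seconds elapsed
> 	 * (the leap second wart).
> 	 */
> 	if (i > LARGE_STEP)
> 		i = 2;
> 	for (; i > 0; i--)
> 		ntp_update_second(th);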
> 
> The magic number approach is less than ideal because it doesn't handle
> short suspends correctly: after a suspend shorter than 200 seconds,
> every suspended second still deducts skew from th_adjtimedelta even
> though we do not skew the clock during suspend.
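> 
> For example, at the full 5000ppm rate a 100 second suspend still
> consumes 100 * 5000us = 500000us of th_adjtimedelta on wakeup, so
> about half a second of the requested adjustment never actually makes
> it into the clock.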
> 
> Now that the timehands have a concept of "runtime" (time spent not
> suspended) I think it would be nicer if we called ntp_update_second()
> once per second on the runtime clock, starting the period at an
> arbitrary point.
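> 
> (Concretely, "runtime" is th_offset minus the accumulated th_naptime,
> which is what the diff computes with
> 
> 	bintimesub(&th->th_offset, &th->th_naptime, &runtime);
> 
> so it does not advance across a suspend.)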
> 
> So, this diff:
> 
> When adjtime(2) is called, the next NTP update point
> (th_next_ntp_update) is reset to the current runtime.  Thereafter,
> once per runtime second, ntp_update_second() is called.
> 
> We don't deduct any skew from th_adjtimedelta across a big UTC jump
> (like a suspend) because the runtime clock does not advance while the
> machine is down.
> 
> Another upside is that skew changes via adjtime(2) happen immediately
> instead of being applied up to one second later.  For example, if the
> adjtime(2) skew is cancelled, the skew stops right away instead of
> continuing for up to one second.  This behavior seems more correct to
> me.
> 
> And, obviously, we can get rid of the magic number.
> 
> --
> 
> otto: Does the NTP algorithm *require* us to distribute the adjtime(2)
>       skew as we do?  At the start of the UTC second?  Or can we choose
>       an arbitrary starting point for the period like I do in this diff?
> 
>       My intuition is that this diff shouldn't break anything, and my
>       testing suggests it doesn't, but I'd appreciate a test all the same.

As far as I know, the NTP adjustment algorithm does not depend on a
particular starting point for the adjustment period.

        -Otto

> 
> Index: kern_tc.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_tc.c,v
> retrieving revision 1.62
> diff -u -p -r1.62 kern_tc.c
> --- kern_tc.c 6 Jul 2020 13:33:09 -0000       1.62
> +++ kern_tc.c 15 Jul 2020 13:56:22 -0000
> @@ -35,14 +35,6 @@
>  #include <sys/queue.h>
>  #include <sys/malloc.h>
>  
> -/*
> - * A large step happens on boot.  This constant detects such steps.
> - * It is relatively small so that ntp_update_second gets called enough
> - * in the typical 'missed a couple of seconds' case, but doesn't loop
> - * forever when the time step is large.
> - */
> -#define LARGE_STEP   200
> -
>  u_int dummy_get_timecount(struct timecounter *);
>  
>  int sysctl_tc_hardware(void *, size_t *, void *, size_t);
> @@ -77,6 +69,7 @@ struct timehands {
>       /* These fields must be initialized by the driver. */
>       struct timecounter      *th_counter;            /* [W] */
>       int64_t                 th_adjtimedelta;        /* [T,W] */
> +     struct bintime          th_next_ntp_update;     /* [T,W] */
>       int64_t                 th_adjustment;          /* [W] */
>       u_int64_t               th_scale;               /* [W] */
>       u_int                   th_offset_count;        /* [W] */
> @@ -564,12 +557,11 @@ void
>  tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
>      int64_t *new_adjtimedelta)
>  {
> -     struct bintime bt;
> +     struct bintime diff, runtime, utc;
>       struct timecounter *active_tc;
>       struct timehands *th, *tho;
>       u_int64_t scale;
>       u_int delta, ncount, ogen;
> -     int i;
>  
>       if (new_boottime != NULL || new_adjtimedelta != NULL)
>               rw_assert_wrlock(&tc_lock);
> @@ -609,8 +601,8 @@ tc_windup(struct bintime *new_boottime, 
>        * accordingly.
>        */
>       if (new_offset != NULL && bintimecmp(&th->th_offset, new_offset, <)) {
> -             bintimesub(new_offset, &th->th_offset, &bt);
> -             bintimeadd(&th->th_naptime, &bt, &th->th_naptime);
> +             bintimesub(new_offset, &th->th_offset, &diff);
> +             bintimeadd(&th->th_naptime, &diff, &th->th_naptime);
>               th->th_offset = *new_offset;
>       }
>  
> @@ -633,30 +625,29 @@ tc_windup(struct bintime *new_boottime, 
>        */
>       if (new_boottime != NULL)
>               th->th_boottime = *new_boottime;
> -     if (new_adjtimedelta != NULL)
> +     if (new_adjtimedelta != NULL) {
>               th->th_adjtimedelta = *new_adjtimedelta;
> +             /* Reset the NTP update period. */
> +             bintimesub(&th->th_offset, &th->th_naptime,
> +                 &th->th_next_ntp_update);
> +     }
>  
>       /*
> -      * Deal with NTP second processing.  The for loop normally
> +      * Deal with NTP second processing.  The while-loop normally
>        * iterates at most once, but in extreme situations it might
> -      * keep NTP sane if timeouts are not run for several seconds.
> -      * At boot, the time step can be large when the TOD hardware
> -      * has been read, so on really large steps, we call
> -      * ntp_update_second only twice.  We need to call it twice in
> -      * case we missed a leap second.
> +      * keep NTP sane if tc_windup() is not run for several seconds.
>        */
> -     bt = th->th_offset;
> -     bintimeadd(&bt, &th->th_boottime, &bt);
> -     i = bt.sec - tho->th_microtime.tv_sec;
> -     if (i > LARGE_STEP)
> -             i = 2;
> -     for (; i > 0; i--)
> +     bintimesub(&th->th_offset, &th->th_naptime, &runtime);
> +     while (bintimecmp(&th->th_next_ntp_update, &runtime, <=)) {
>               ntp_update_second(th);
> +             th->th_next_ntp_update.sec++;
> +     }
>  
>       /* Update the UTC timestamps used by the get*() functions. */
>       /* XXX shouldn't do this here.  Should force non-`get' versions. */
> -     BINTIME_TO_TIMEVAL(&bt, &th->th_microtime);
> -     BINTIME_TO_TIMESPEC(&bt, &th->th_nanotime);
> +     bintimeadd(&th->th_boottime, &th->th_offset, &utc);
> +     BINTIME_TO_TIMEVAL(&utc, &th->th_microtime);
> +     BINTIME_TO_TIMESPEC(&utc, &th->th_nanotime);
>  
>       /* Now is a good time to change timecounters. */
>       if (th->th_counter != active_tc) {
