Hi,
adjtime(2) skews the clock at a rate of up to 5000ppm, i.e. by at most
5 milliseconds of adjustment per elapsed second. The way this
second we call ntp_update_second() from tc_windup() and reset
th_adjustment. th_adjustment is then mixed into the scale for one UTC
second. This cycle slowly chips away at th_adjtimedelta, eventually
reducing it to zero.
This is fine, except that using UTC for your update period requires
you to work around the fact that the UTC time can jump forward by a
huge amount.
There are two notable jumps:
1. The big jump forward to the RTC time during boot.
2. The big jump forward to the RTC time after each resume.
To handle this we have a magic number in the code, LARGE_STEP. If the
UTC time jumps more than LARGE_STEP (200) seconds we truncate the
number of ntp_update_second() calls to 2 to avoid looping endlessly in
tc_windup(). Here we find a wart: we do 2 calls to account for a
missed leap second, even though we no longer handle those in the
kernel.
The magic number approach is less than ideal because it doesn't handle
short suspends correctly: suspends shorter than 200 seconds are
deducted from th_adjtimedelta even though we do not skew the clock
during suspend.
Now that the timehands have a concept of "runtime" (time spent not
suspended) I think it would be nicer if we called ntp_update_second()
at one-second intervals along an arbitrarily aligned period on the
runtime clock.
So, this diff:
When adjtime(2) is called the NTP update period (th_next_ntp_update)
is changed to align with the current runtime. Thereafter, once per
second, ntp_update_second() is called.
We don't deduct any skew from th_adjtimedelta across a big UTC jump
(like a suspend) because the runtime clock does not advance while the
machine is down.
Another upside is that skew changes via adjtime(2) happen immediately
instead of being applied up to one second later. For example, if the
adjtime(2) skew is cancelled, the skew stops right away instead of
continuing for up to one second. This behavior seems more correct to
me.
And, obviously, we can get rid of the magic number.
--
otto: Does the NTP algorithm *require* us to distribute the adjtime(2)
skew as we do? At the start of the UTC second? Or can we choose
an arbitrary starting point for the period like I do in this diff?
My intuition is that this diff shouldn't break anything, and my
testing suggests it doesn't, but I'd appreciate a test all the same.
Index: kern_tc.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_tc.c,v
retrieving revision 1.62
diff -u -p -r1.62 kern_tc.c
--- kern_tc.c 6 Jul 2020 13:33:09 -0000 1.62
+++ kern_tc.c 15 Jul 2020 13:56:22 -0000
@@ -35,14 +35,6 @@
#include <sys/queue.h>
#include <sys/malloc.h>
-/*
- * A large step happens on boot. This constant detects such steps.
- * It is relatively small so that ntp_update_second gets called enough
- * in the typical 'missed a couple of seconds' case, but doesn't loop
- * forever when the time step is large.
- */
-#define LARGE_STEP 200
-
u_int dummy_get_timecount(struct timecounter *);
int sysctl_tc_hardware(void *, size_t *, void *, size_t);
@@ -77,6 +69,7 @@ struct timehands {
/* These fields must be initialized by the driver. */
struct timecounter *th_counter; /* [W] */
int64_t th_adjtimedelta; /* [T,W] */
+ struct bintime th_next_ntp_update; /* [T,W] */
int64_t th_adjustment; /* [W] */
u_int64_t th_scale; /* [W] */
u_int th_offset_count; /* [W] */
@@ -564,12 +557,11 @@ void
tc_windup(struct bintime *new_boottime, struct bintime *new_offset,
int64_t *new_adjtimedelta)
{
- struct bintime bt;
+ struct bintime diff, runtime, utc;
struct timecounter *active_tc;
struct timehands *th, *tho;
u_int64_t scale;
u_int delta, ncount, ogen;
- int i;
if (new_boottime != NULL || new_adjtimedelta != NULL)
rw_assert_wrlock(&tc_lock);
@@ -609,8 +601,8 @@ tc_windup(struct bintime *new_boottime,
* accordingly.
*/
if (new_offset != NULL && bintimecmp(&th->th_offset, new_offset, <)) {
- bintimesub(new_offset, &th->th_offset, &bt);
- bintimeadd(&th->th_naptime, &bt, &th->th_naptime);
+ bintimesub(new_offset, &th->th_offset, &diff);
+ bintimeadd(&th->th_naptime, &diff, &th->th_naptime);
th->th_offset = *new_offset;
}
@@ -633,30 +625,29 @@ tc_windup(struct bintime *new_boottime,
*/
if (new_boottime != NULL)
th->th_boottime = *new_boottime;
- if (new_adjtimedelta != NULL)
+ if (new_adjtimedelta != NULL) {
th->th_adjtimedelta = *new_adjtimedelta;
+ /* Reset the NTP update period. */
+ bintimesub(&th->th_offset, &th->th_naptime,
+ &th->th_next_ntp_update);
+ }
/*
- * Deal with NTP second processing. The for loop normally
+ * Deal with NTP second processing. The while-loop normally
* iterates at most once, but in extreme situations it might
- * keep NTP sane if timeouts are not run for several seconds.
- * At boot, the time step can be large when the TOD hardware
- * has been read, so on really large steps, we call
- * ntp_update_second only twice. We need to call it twice in
- * case we missed a leap second.
+ * keep NTP sane if tc_windup() is not run for several seconds.
*/
- bt = th->th_offset;
- bintimeadd(&bt, &th->th_boottime, &bt);
- i = bt.sec - tho->th_microtime.tv_sec;
- if (i > LARGE_STEP)
- i = 2;
- for (; i > 0; i--)
+ bintimesub(&th->th_offset, &th->th_naptime, &runtime);
+ while (bintimecmp(&th->th_next_ntp_update, &runtime, <=)) {
ntp_update_second(th);
+ th->th_next_ntp_update.sec++;
+ }
/* Update the UTC timestamps used by the get*() functions. */
/* XXX shouldn't do this here. Should force non-`get' versions. */
- BINTIME_TO_TIMEVAL(&bt, &th->th_microtime);
- BINTIME_TO_TIMESPEC(&bt, &th->th_nanotime);
+ bintimeadd(&th->th_boottime, &th->th_offset, &utc);
+ BINTIME_TO_TIMEVAL(&utc, &th->th_microtime);
+ BINTIME_TO_TIMESPEC(&utc, &th->th_nanotime);
/* Now is a good time to change timecounters. */
if (th->th_counter != active_tc) {