This is an automated email from the ASF dual-hosted git repository.
adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 7a6df6a system_ntp: wait for sync via kernel
7a6df6a is described below
commit 7a6df6aa18ff31b7947890cbc44b49189a23a5bf
Author: Adar Dembo <[email protected]>
AuthorDate: Mon May 13 14:39:52 2019 -0700
system_ntp: wait for sync via kernel
We've been having more and more trouble with dist-test slave clocks becoming
desynchronized. Because slaves run inside docker containers, we can't use
ntpd/chrony-based tools such as ntp-wait to wait for synchronization.
Moreover, this highlights a general issue with the existing wait-for-sync
approach: we can't differentiate between environments where ntpd isn't
running but should be, and environments where it's not but synchronization
is still expected.
This patch switches from ntp-wait and friends to ntp_adjtime-based waiting.
Originally it also cut the wait time from 60s to 30s (as an overture for
users running in the first environment who would prefer not to wait a full
minute to know that their ntpd isn't running), but this proved to be
insufficient to ride out dist-test clock desynchronization.
I also looked at the source code for ntp-wait and chronyc, but didn't see
them doing anything fundamentally more interesting than what we're now
doing with ntp_adjtime.
Change-Id: Ic133c4f9b5fd933216fb27f9f01396a5fee8276b
Reviewed-on: http://gerrit.cloudera.org:8080/13323
Tested-by: Kudu Jenkins
Reviewed-by: Will Berkeley <[email protected]>
---
src/kudu/clock/system_ntp.cc | 45 ++++++++++++++++++--------------------------
1 file changed, 18 insertions(+), 27 deletions(-)
diff --git a/src/kudu/clock/system_ntp.cc b/src/kudu/clock/system_ntp.cc
index 108056a..9993838 100644
--- a/src/kudu/clock/system_ntp.cc
+++ b/src/kudu/clock/system_ntp.cc
@@ -35,6 +35,7 @@
#include "kudu/util/errno.h"
#include "kudu/util/flag_tags.h"
#include "kudu/util/logging.h"
+#include "kudu/util/monotime.h"
#include "kudu/util/path_util.h"
#include "kudu/util/status.h"
#include "kudu/util/subprocess.h"
@@ -124,36 +125,26 @@ Status WaitForNtp() {
}
LOG(INFO) << Substitute("Waiting up to --ntp_initial_sync_wait_secs=$0 "
"seconds for the clock to synchronize", wait_secs);
- vector<string> cmd;
- string exe;
- Status s = FindExecutable("ntp-wait", {"/sbin", "/usr/sbin"}, &exe);
- if (s.ok()) {
- // -s is the number of seconds to sleep between retries.
- // -n is the number of tries before giving up.
- cmd = {exe, "-s", "1", "-n", std::to_string(wait_secs)};
- } else {
- LOG(WARNING) << "Could not find ntp-wait; trying chrony waitsync instead: "
- << s.ToString();
- s = FindExecutable("chronyc", {"/sbin", "/usr/sbin"}, &exe);
- if (!s.ok()) {
- LOG(WARNING) << "Could not find chronyc: " << s.ToString();
- return Status::NotFound("failed to find ntp-wait or chronyc");
+
+ // We previously relied on ntpd/chrony support tools to wait, but that
+ // approach doesn't work in environments where ntpd is unreachable but the
+ // clock is still synchronized (i.e. running inside a Linux container).
+ //
+ // Now we just interrogate the kernel directly.
+ Status s;
+ for (int i = 0; i < wait_secs; i++) {
+ timex timex;
+ s = CallAdjTime(&timex);
+ if (s.ok() || !s.IsServiceUnavailable()) {
+ return s;
}
- // Usage: waitsync max-tries max-correction max-skew interval.
- // max-correction and max-skew parameters as 0 means no checks.
- // The interval is measured in seconds.
- cmd = {exe, "waitsync", std::to_string(wait_secs), "0", "0", "1"};
- }
- // Unfortunately, neither ntp-wait nor chronyc waitsync print useful
messages.
- // Instead, rely on DumpDiagnostics.
- s = Subprocess::Call(cmd);
- if (!s.ok()) {
- return s.CloneAndPrepend(
- Substitute("failed to wait for clock sync using command '$0'",
- JoinStrings(cmd, " ")));
+ SleepFor(MonoDelta::FromSeconds(1));
}
- return Status::OK();
+
+ // Return the last failure.
+ return s.CloneAndPrepend("Timed out waiting for clock sync");
}
+
} // anonymous namespace
void SystemNtp::DumpDiagnostics(vector<string>* log) const {