This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a6df6a  system_ntp: wait for sync via kernel
7a6df6a is described below

commit 7a6df6aa18ff31b7947890cbc44b49189a23a5bf
Author: Adar Dembo <[email protected]>
AuthorDate: Mon May 13 14:39:52 2019 -0700

    system_ntp: wait for sync via kernel
    
    We've been having more and more trouble with dist-test slave clocks becoming
    desynchronized. Because slaves run inside docker containers, we can't use
    ntpd/chrony-based tools such as ntp-wait to wait for synchronization.
    Moreover, this highlights a general issue with the existing wait-for-sync
    approach: we can't differentiate between environments where ntpd isn't
    running but should be, and environments where it's not but synchronization
    is still expected.
    
    This patch switches from ntp-wait and friends to ntp_adjtime-based waiting.
    Originally it also cut the wait time from 60s to 30s (as an overture for
    users running in the first environment who would prefer not to wait a full
    minute to know that their ntpd isn't running), but this proved to be
    insufficient to ride out dist-test clock desynchronization.
    
    I also looked at the source code for ntp-wait and chronyc, but didn't see
    them doing anything fundamentally more interesting than what we're now
    doing with ntp_adjtime.
    
    Change-Id: Ic133c4f9b5fd933216fb27f9f01396a5fee8276b
    Reviewed-on: http://gerrit.cloudera.org:8080/13323
    Tested-by: Kudu Jenkins
    Reviewed-by: Will Berkeley <[email protected]>
---
 src/kudu/clock/system_ntp.cc | 45 ++++++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/src/kudu/clock/system_ntp.cc b/src/kudu/clock/system_ntp.cc
index 108056a..9993838 100644
--- a/src/kudu/clock/system_ntp.cc
+++ b/src/kudu/clock/system_ntp.cc
@@ -35,6 +35,7 @@
 #include "kudu/util/errno.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/logging.h"
+#include "kudu/util/monotime.h"
 #include "kudu/util/path_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/subprocess.h"
@@ -124,36 +125,26 @@ Status WaitForNtp() {
   }
   LOG(INFO) << Substitute("Waiting up to --ntp_initial_sync_wait_secs=$0 "
                           "seconds for the clock to synchronize", wait_secs);
-  vector<string> cmd;
-  string exe;
-  Status s = FindExecutable("ntp-wait", {"/sbin", "/usr/sbin"}, &exe);
-  if (s.ok()) {
-    // -s is the number of seconds to sleep between retries.
-    // -n is the number of tries before giving up.
-    cmd = {exe, "-s", "1", "-n", std::to_string(wait_secs)};
-  } else {
-    LOG(WARNING) << "Could not find ntp-wait; trying chrony waitsync instead: "
-                 << s.ToString();
-    s = FindExecutable("chronyc", {"/sbin", "/usr/sbin"}, &exe);
-    if (!s.ok()) {
-      LOG(WARNING) << "Could not find chronyc: " << s.ToString();
-      return Status::NotFound("failed to find ntp-wait or chronyc");
+
+  // We previously relied on ntpd/chrony support tools to wait, but that
+  // approach doesn't work in environments where ntpd is unreachable but the
+  // clock is still synchronized (i.e. running inside a Linux container).
+  //
+  // Now we just interrogate the kernel directly.
+  Status s;
+  for (int i = 0; i < wait_secs; i++) {
+    timex timex;
+    s = CallAdjTime(&timex);
+    if (s.ok() || !s.IsServiceUnavailable()) {
+      return s;
     }
-    // Usage: waitsync max-tries max-correction max-skew interval.
-    // max-correction and max-skew parameters as 0 means no checks.
-    // The interval is measured in seconds.
-    cmd = {exe, "waitsync", std::to_string(wait_secs), "0", "0", "1"};
-  }
-  // Unfortunately, neither ntp-wait nor chronyc waitsync print useful 
messages.
-  // Instead, rely on DumpDiagnostics.
-  s = Subprocess::Call(cmd);
-  if (!s.ok()) {
-    return s.CloneAndPrepend(
-        Substitute("failed to wait for clock sync using command '$0'",
-                   JoinStrings(cmd, " ")));
+    SleepFor(MonoDelta::FromSeconds(1));
   }
-  return Status::OK();
+
+  // Return the last failure.
+  return s.CloneAndPrepend("Timed out waiting for clock sync");
 }
+
 } // anonymous namespace
 
 void SystemNtp::DumpDiagnostics(vector<string>* log) const {

Reply via email to