This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 4859d2902 [clock] KUDU-3521 fix crash when clock is synchronized by
PTPd
4859d2902 is described below
commit 4859d290277bf36f0bd84891c4764194c2cf9521
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Oct 25 16:38:21 2023 -0700
[clock] KUDU-3521 fix crash when clock is synchronized by PTPd
There was an earlier attempt to address the issue [1], but the fix
hasn't received +2 since there was not enough evidence behind the
root cause analysis. With the report on #kudu-general Slack
channel [2], from the analysis of the code [3] it's easy to see there
isn't any other way to get such a manifestation of the issue but
a negative value for the 'maxerror' field of the 'timex' structure
returned by the ntp_adjtime()/adjtimex() system call.
The essence of the problem is that in Kudu the maximum error is supposed
to be a non-negative number. This patch addresses the issue.
[1] https://gerrit.cloudera.org/#/c/12149/
[2] https://getkudu.slack.com/archives/C0CPXJ3CH/p1698246065354269
[3]
https://github.com/apache/kudu/blob/04fdbd0974f4418295d57c0daa4b67de3e777a43/src/kudu/clock/hybrid_clock.cc#L627-L706
Change-Id: Ibbe1a50c4857b9742d2ffde35440d0dee082edc0
Reviewed-on: http://gerrit.cloudera.org:8080/20626
Tested-by: Kudu Jenkins
Reviewed-by: Yingchun Lai <[email protected]>
Reviewed-by: Abhishek Chennaka <[email protected]>
---
src/kudu/clock/builtin_ntp.cc | 12 ++++++------
src/kudu/clock/hybrid_clock.cc | 27 +++++++++++++++++++--------
src/kudu/clock/system_ntp.cc | 3 ++-
3 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/src/kudu/clock/builtin_ntp.cc b/src/kudu/clock/builtin_ntp.cc
index f5f416e82..348a85e8b 100644
--- a/src/kudu/clock/builtin_ntp.cc
+++ b/src/kudu/clock/builtin_ntp.cc
@@ -24,7 +24,7 @@
#include <cerrno>
#include <cstdint>
-#include <cstring>
+#include <cstdlib>
#include <deque>
#include <functional>
#include <limits>
@@ -569,19 +569,19 @@ Status BuiltInNtp::WalltimeWithError(uint64_t* now_usec,
uint64_t* error_usec) {
const auto mono = GetMonoTimeMicrosRaw();
DCHECK_GE(mono, last.mono);
- const int64_t delta_t = mono - last.mono;
+ const int64_t delta = mono - last.mono;
if (PREDICT_FALSE(MonoDelta::FromSeconds(
FLAGS_builtin_ntp_true_time_refresh_max_interval_s) <
- MonoDelta::FromMicroseconds(delta_t))) {
+ MonoDelta::FromMicroseconds(delta))) {
return Status::ServiceUnavailable(Substitute(
"$0: too long after last true time refresh",
- MonoDelta::FromMicroseconds(delta_t).ToString()));
+ MonoDelta::FromMicroseconds(delta).ToString()));
}
// TODO(KUDU-2940): apply measured local clock skew against the true time
// clock when computing projected wallclock reading and
error
- *now_usec = delta_t + last.wall;
- *error_usec = last.error + delta_t * kSkewPpm / 1e6;
+ *now_usec = delta + last.wall;
+ *error_usec = std::abs(last.error + (delta * kSkewPpm / 1000000));
return Status::OK();
}
diff --git a/src/kudu/clock/hybrid_clock.cc b/src/kudu/clock/hybrid_clock.cc
index 820d93c4e..f9c0a7816 100644
--- a/src/kudu/clock/hybrid_clock.cc
+++ b/src/kudu/clock/hybrid_clock.cc
@@ -672,9 +672,9 @@ Status HybridClock::InitWithTimeSource(TimeSource
time_source) {
}
Status HybridClock::WalltimeWithError(uint64_t* now_usec, uint64_t*
error_usec) {
- auto read_time_before = MonoTime::Now();
+ const auto read_time_before = MonoTime::Now();
Status s = time_service_->WalltimeWithError(now_usec, error_usec);
- auto read_time_after = MonoTime::Now();
+ const auto read_time_after = MonoTime::Now();
bool is_extrapolated = false;
if (PREDICT_TRUE(s.ok())) {
@@ -697,8 +697,15 @@ Status HybridClock::WalltimeWithError(uint64_t* now_usec,
uint64_t* error_usec)
// The max likelihood estimate is that 'B' corresponds to the average of
'A'
// and 'C'. Then we need to add in this uncertainty (half of C - A) into
any
// future clock readings that we extrapolate from this estimate.
- int64_t read_duration_us = (read_time_after -
read_time_before).ToMicroseconds();
- int64_t read_time_error_us = read_duration_us / 2;
+ const int64_t read_duration_us = (read_time_after -
read_time_before).ToMicroseconds();
+
+ // Per manual page of clock_gettime(CLOCK_MONOTONIC) [1],
'read_duration_us'
+ // is guaranteed to be a non-negative number. Just out of paranoia,
+ // transform it into 0 if it turns to be a negative number.
+ //
+ // [1] https://man7.org/linux/man-pages/man3/clock_gettime.3.html
+ const uint64_t read_time_error_us =
+ (read_duration_us > 0) ? read_duration_us / 2 : 0;
MonoTime read_time_max_likelihood = read_time_before +
MonoDelta::FromMicroseconds(read_time_error_us);
@@ -724,12 +731,16 @@ Status HybridClock::WalltimeWithError(uint64_t* now_usec,
uint64_t* error_usec)
is_extrapolating_ = true;
extrapolating_->set_value(is_extrapolating_);
}
- if (!last_clock_read_time_.Initialized()) {
+ if (PREDICT_FALSE(!last_clock_read_time_.Initialized())) {
RETURN_NOT_OK_PREPEND(s, "could not read system time source");
}
MonoDelta time_since_last_read = read_time_after - last_clock_read_time_;
- int64_t micros_since_last_read = time_since_last_read.ToMicroseconds();
- int64_t accum_error_us = (micros_since_last_read *
time_service_->skew_ppm()) / 1000000;
+ const int64_t micros_since_last_read =
time_since_last_read.ToMicroseconds();
+ if (PREDICT_FALSE(micros_since_last_read < 0)) {
+ RETURN_NOT_OK_PREPEND(s, "inconsistent readings of the monotonic clock");
+ }
+ const uint64_t accum_error_us =
+ std::abs(micros_since_last_read * time_service_->skew_ppm()) / 1000000;
*now_usec = last_clock_read_physical_ + micros_since_last_read;
*error_usec = last_clock_read_error_ + accum_error_us;
is_extrapolated = true;
@@ -749,7 +760,7 @@ Status HybridClock::WalltimeWithError(uint64_t* now_usec,
uint64_t* error_usec)
*error_usec,
is_extrapolated ? "unsynchronized" : "synchronized"));
}
- return kudu::Status::OK();
+ return Status::OK();
}
// Used to get the timestamp for metrics.
diff --git a/src/kudu/clock/system_ntp.cc b/src/kudu/clock/system_ntp.cc
index ca8965c14..f84248bfe 100644
--- a/src/kudu/clock/system_ntp.cc
+++ b/src/kudu/clock/system_ntp.cc
@@ -21,6 +21,7 @@
#include <sys/timex.h>
#include <cerrno>
+#include <cstdlib>
#include <functional>
#include <limits>
#include <ostream>
@@ -173,7 +174,7 @@ Status SystemNtp::WalltimeWithError(uint64_t* now_usec,
uint64_t* error_usec) {
uint64_t now = static_cast<uint64_t>(t.time.tv_sec) * 1000000 +
t.time.tv_usec;
#endif
- *error_usec = t.maxerror;
+ *error_usec = std::abs(t.maxerror);
*now_usec = now;
return Status::OK();
}