This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 7d9d700 [clock] KUDU-3048 add builtin-NTP-specific metrics
7d9d700 is described below
commit 7d9d7009ac288327a54ea694d171f8c2ad23165d
Author: Alexey Serbin <[email protected]>
AuthorDate: Wed Feb 19 10:57:06 2020 -0800
[clock] KUDU-3048 add builtin-NTP-specific metrics
Added metrics specific to the built-in NTP client:
* info : difference between tracked true time and local wallclock
* debug: most recently computed true time
* debug: histogram on the maximum error of the computed true time
For testing, I manually ran kudu-master and verified the newly
introduced metrics are present and their values make sense.
I also re-visited the level of recently introduced metrics in the
HybridClock class.
Change-Id: Ia0a13d58c6e5da50ef7a98ae73de267ec4eab491
Reviewed-on: http://gerrit.cloudera.org:8080/15249
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <[email protected]>
---
src/kudu/clock/builtin_ntp.cc | 76 +++++++++++++++++++++++++++++++++++++++---
src/kudu/clock/builtin_ntp.h | 31 ++++++++++++++---
src/kudu/clock/hybrid_clock.cc | 24 ++++++++-----
src/kudu/clock/hybrid_clock.h | 5 ++-
src/kudu/clock/ntp-test.cc | 24 +++++++++----
5 files changed, 135 insertions(+), 25 deletions(-)
diff --git a/src/kudu/clock/builtin_ntp.cc b/src/kudu/clock/builtin_ntp.cc
index ee2893f..c9ed6aa 100644
--- a/src/kudu/clock/builtin_ntp.cc
+++ b/src/kudu/clock/builtin_ntp.cc
@@ -27,6 +27,7 @@
#include <cstdint>
#include <cstring>
#include <deque>
+#include <limits>
#include <memory>
#include <mutex>
#include <ostream>
@@ -38,6 +39,8 @@
#include <glog/logging.h>
#include "kudu/clock/builtin_ntp-internal.h"
+#include "kudu/gutil/bind.h"
+#include "kudu/gutil/bind_helpers.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/strcat.h"
@@ -47,6 +50,7 @@
#include "kudu/util/flag_tags.h"
#include "kudu/util/locks.h"
#include "kudu/util/logging.h"
+#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/net/sockaddr.h"
@@ -105,6 +109,29 @@
DEFINE_uint32(builtin_ntp_true_time_refresh_max_interval_s, 3600,
TAG_FLAG(builtin_ntp_true_time_refresh_max_interval_s, experimental);
TAG_FLAG(builtin_ntp_true_time_refresh_max_interval_s, runtime);
+METRIC_DEFINE_gauge_int64(server, builtin_ntp_local_clock_delta,
+ "Local Clock vs Built-In NTP True Time Delta",
+ kudu::MetricUnit::kMilliseconds,
+ "Delta between local clock and true time "
+ "tracked by built-in NTP client; set to "
+ "2^63-1 when true time is not tracked",
+ kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_int64(server, builtin_ntp_walltime,
+ "Built-in NTP Wall-Clock Time",
+ kudu::MetricUnit::kMicroseconds,
+ "Latest wall-clock time as tracked by "
+ "built-in NTP client",
+ kudu::MetricLevel::kDebug);
+
+METRIC_DEFINE_histogram(server, builtin_ntp_max_errors,
+ "Built-In NTP Maximum Errors",
+ kudu::MetricUnit::kMilliseconds,
+ "Statistics on the maximum true time error computed by
"
+ "built-in NTP client",
+ kudu::MetricLevel::kDebug,
+ 20000, 3);
+
using kudu::clock::internal::Interval;
using kudu::clock::internal::kIntervalNone;
using kudu::clock::internal::RecordedResponse;
@@ -445,7 +472,6 @@ class BuiltInNtp::ServerState {
}
private:
-
// Lock to protect internal state below from concurrent access.
mutable rw_spinlock lock_;
@@ -475,13 +501,16 @@ class BuiltInNtp::ServerState {
size_t o_pkt_total_num_; // number of NTP requests sent
};
-BuiltInNtp::BuiltInNtp()
+BuiltInNtp::BuiltInNtp(const scoped_refptr<MetricEntity>& metric_entity)
: rng_(GetRandomSeed32()) {
+ RegisterMetrics(metric_entity);
}
-BuiltInNtp::BuiltInNtp(vector<HostPort> servers)
+BuiltInNtp::BuiltInNtp(vector<HostPort> servers,
+ const scoped_refptr<MetricEntity>& metric_entity)
: rng_(GetRandomSeed32()) {
CHECK_OK(PopulateServers(std::move(servers)));
+ RegisterMetrics(metric_entity);
}
BuiltInNtp::~BuiltInNtp() {
@@ -1019,8 +1048,8 @@ Status BuiltInNtp::CombineClocks() {
falsetickers_num, all_sources_num);
}
- int64_t compute_wall = (best_interval.first + best_interval.second) / 2;
- int64_t compute_error = (best_interval.second - best_interval.first) / 2;
+ const int64_t compute_wall = (best_interval.first + best_interval.second) /
2;
+ const int64_t compute_error = (best_interval.second - best_interval.first) /
2;
{
// Extra sanity check to make sure walltime doesn't go back.
std::lock_guard<rw_spinlock> l(last_computed_lock_);
@@ -1036,6 +1065,9 @@ Status BuiltInNtp::CombineClocks() {
last_computed_.mono = now;
last_computed_.wall = compute_wall;
last_computed_.error = compute_error;
+
+ // Update stats on the computed error.
+ max_errors_histogram_->Increment(compute_error / 1000);
}
VLOG(2) << Substitute("combined clocks: $0 $1 $2",
now, compute_wall, compute_error);
@@ -1051,5 +1083,39 @@ Status BuiltInNtp::CombineClocks() {
return Status::OK();
}
+void BuiltInNtp::RegisterMetrics(const scoped_refptr<MetricEntity>& entity) {
+ METRIC_builtin_ntp_local_clock_delta.InstantiateFunctionGauge(
+ entity,
+ Bind(&BuiltInNtp::LocalClockDeltaForMetrics, Unretained(this)))->
+ AutoDetachToLastValue(&metric_detacher_);
+ METRIC_builtin_ntp_walltime.InstantiateFunctionGauge(
+ entity,
+ Bind(&BuiltInNtp::WalltimeForMetrics, Unretained(this)))->
+ AutoDetachToLastValue(&metric_detacher_);
+ max_errors_histogram_ =
+ METRIC_builtin_ntp_max_errors.Instantiate(entity);
+}
+
+int64_t BuiltInNtp::LocalClockDeltaForMetrics() {
+ uint64_t now = 0; // avoid clang-tidy warnings
+ uint64_t err_ignored;
+ auto s = WalltimeWithError(&now, &err_ignored);
+ const int64_t now_local = GetCurrentTimeMicros();
+ if (!s.ok()) {
+ return std::numeric_limits<int64_t>::max();
+ }
+
+ // Here we don't care much about scheduling anomalies to take into account
+ // the timing of the clock sampling above. There might be some delay between
+ // calls to WalltimeWithError() and GetCurrentTimeMicros(), but the metric
+ // is designed to spot relatively big offsets like few seconds and more.
+ return (now_local - static_cast<int64_t>(now)) / 1000;
+}
+
+int64_t BuiltInNtp::WalltimeForMetrics() {
+ shared_lock<rw_spinlock> l(last_computed_lock_);
+ return last_computed_.wall;
+}
+
} // namespace clock
} // namespace kudu
diff --git a/src/kudu/clock/builtin_ntp.h b/src/kudu/clock/builtin_ntp.h
index ffd8b9f..7b0dc8f 100644
--- a/src/kudu/clock/builtin_ntp.h
+++ b/src/kudu/clock/builtin_ntp.h
@@ -26,6 +26,7 @@
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/locks.h"
+#include "kudu/util/metrics.h"
#include "kudu/util/mutex.h"
#include "kudu/util/net/socket.h"
#include "kudu/util/random.h"
@@ -67,12 +68,15 @@ constexpr const int kStandardNtpPort = 123;
class BuiltInNtp : public TimeService {
public:
// Create an instance using the servers specified in --builtin_ntp_servers
- // as NTP sources.
- BuiltInNtp();
+ // as NTP sources. The 'metric_entity' is used to register metrics specific
to
+ // the built-in NTP client.
+ explicit BuiltInNtp(const scoped_refptr<MetricEntity>& metric_entity);
// Create an instance using the specified servers as NTP sources. The set
- // of source NTP servers must not be empty.
- explicit BuiltInNtp(std::vector<HostPort> servers);
+ // of source NTP servers must not be empty. The 'metric_entity' is used
+ // to register metrics specific to the built-in NTP client.
+ BuiltInNtp(std::vector<HostPort> servers,
+ const scoped_refptr<MetricEntity>& metric_entity);
~BuiltInNtp() override;
@@ -161,6 +165,16 @@ class BuiltInNtp : public TimeService {
// based on responses received so far from configured NTP servers.
Status CombineClocks();
+ // Register all metrics tracked by the built-in NTP client.
+ void RegisterMetrics(const scoped_refptr<MetricEntity>& entity);
+
+ // Get difference between the local clock and the tracked true time,
+ // in milliseconds.
+ int64_t LocalClockDeltaForMetrics();
+
+ // Get the latest computed wall-clock time, in microseconds.
+ int64_t WalltimeForMetrics();
+
Random rng_;
Socket socket_;
@@ -186,6 +200,15 @@ class BuiltInNtp : public TimeService {
// updating the maintained walltime based on the NTP responses received.
scoped_refptr<Thread> thread_;
+ // Stats on the maximum error is sampled every when wall-clock time
+ // is calculated upon receiving NTP server responses.
+ scoped_refptr<Histogram> max_errors_histogram_;
+
+ // Clock metrics are set to detach to their last value. This means
+ // that, during our destructor, we'll need to access other class members
+ // declared above this. Hence, this member must be declared last.
+ FunctionGaugeDetacher metric_detacher_;
+
DISALLOW_COPY_AND_ASSIGN(BuiltInNtp);
};
diff --git a/src/kudu/clock/hybrid_clock.cc b/src/kudu/clock/hybrid_clock.cc
index fbb97d0..03f3f27 100644
--- a/src/kudu/clock/hybrid_clock.cc
+++ b/src/kudu/clock/hybrid_clock.cc
@@ -161,23 +161,27 @@ METRIC_DEFINE_gauge_bool(server,
hybrid_clock_extrapolating,
"Whether HybridClock timestamps are extrapolated "
"because of inability to read the underlying clock",
kudu::MetricLevel::kWarn);
+
METRIC_DEFINE_gauge_uint64(server, hybrid_clock_error,
"Hybrid Clock Error",
kudu::MetricUnit::kMicroseconds,
"Server clock maximum error",
kudu::MetricLevel::kInfo);
+
METRIC_DEFINE_gauge_uint64(server, hybrid_clock_timestamp,
"Hybrid Clock Timestamp",
kudu::MetricUnit::kMicroseconds,
"Hybrid clock timestamp",
kudu::MetricLevel::kInfo);
+
METRIC_DEFINE_histogram(server, hybrid_clock_max_errors,
"Hybrid Clock Maximum Errors",
kudu::MetricUnit::kMilliseconds,
"The statistics on the maximum error of the underlying
"
"clock",
- kudu::MetricLevel::kInfo,
+ kudu::MetricLevel::kDebug,
20000, 3);
+
METRIC_DEFINE_histogram(server, hybrid_clock_extrapolation_intervals,
"Intervals of Hybrid Clock Extrapolation",
kudu::MetricUnit::kSeconds,
@@ -210,20 +214,21 @@ Status CheckDeadlineNotWithinMicros(const MonoTime&
deadline, int64_t wait_for_u
HybridClock::HybridClock(const scoped_refptr<MetricEntity>& metric_entity)
: next_timestamp_(0),
- state_(kNotInitialized) {
+ state_(kNotInitialized),
+ metric_entity_(metric_entity) {
DCHECK(metric_entity);
max_errors_histogram_ =
- METRIC_hybrid_clock_max_errors.Instantiate(metric_entity);
+ METRIC_hybrid_clock_max_errors.Instantiate(metric_entity_);
extrapolation_intervals_histogram_ =
- METRIC_hybrid_clock_extrapolation_intervals.Instantiate(metric_entity);
- extrapolating_ = metric_entity->FindOrCreateGauge(
+ METRIC_hybrid_clock_extrapolation_intervals.Instantiate(metric_entity_);
+ extrapolating_ = metric_entity_->FindOrCreateGauge(
&METRIC_hybrid_clock_extrapolating, false, MergeType::kMax);
METRIC_hybrid_clock_timestamp.InstantiateFunctionGauge(
- metric_entity,
+ metric_entity_,
Bind(&HybridClock::NowForMetrics, Unretained(this)))->
AutoDetachToLastValue(&metric_detacher_);
METRIC_hybrid_clock_error.InstantiateFunctionGauge(
- metric_entity,
+ metric_entity_,
Bind(&HybridClock::ErrorForMetrics, Unretained(this)))->
AutoDetachToLastValue(&metric_detacher_);
}
@@ -487,8 +492,9 @@ Status HybridClock::InitWithTimeSource(TimeSource
time_source,
switch (time_source) {
case TimeSource::NTP_SYNC_BUILTIN:
time_service_.reset(builtin_ntp_servers.empty()
- ? new clock::BuiltInNtp
- : new
clock::BuiltInNtp(std::move(builtin_ntp_servers)));
+ ? new clock::BuiltInNtp(metric_entity_)
+ : new
clock::BuiltInNtp(std::move(builtin_ntp_servers),
+ metric_entity_));
break;
#if defined(KUDU_HAS_SYSTEM_TIME_SOURCE)
case TimeSource::NTP_SYNC_SYSTEM:
diff --git a/src/kudu/clock/hybrid_clock.h b/src/kudu/clock/hybrid_clock.h
index d36d5e2..545ba0a 100644
--- a/src/kudu/clock/hybrid_clock.h
+++ b/src/kudu/clock/hybrid_clock.h
@@ -258,6 +258,9 @@ class HybridClock : public Clock {
};
State state_;
+ // Metric entity.
+ scoped_refptr<MetricEntity> metric_entity_;
+
// Whether the hybrid clock is extrapolating the readings of the underlying
// clock instead of using the real ones. It's important to know whether
// the extrapolation is happening, but 'extrapolation_intervals_histogram_'
@@ -265,7 +268,7 @@ class HybridClock : public Clock {
// extrapolation interval.
scoped_refptr<AtomicGauge<bool>> extrapolating_;
- // Stats on the underlying clock's 'maximum clock' metric sampled every
+ // Stats on the underlying clock's 'maximum error' metric sampled every
// NowWithError() call (essentially, every call when requesting a hybrid
clock
// timestamp).
scoped_refptr<Histogram> max_errors_histogram_;
diff --git a/src/kudu/clock/ntp-test.cc b/src/kudu/clock/ntp-test.cc
index da558b2..5403144 100644
--- a/src/kudu/clock/ntp-test.cc
+++ b/src/kudu/clock/ntp-test.cc
@@ -40,10 +40,12 @@
#include "kudu/clock/test/mini_chronyd.h"
#include "kudu/clock/test/mini_chronyd_test_util.h"
#include "kudu/gutil/integral_types.h"
+#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/stringprintf.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/cloud/instance_detector.h"
#include "kudu/util/cloud/instance_metadata.h"
+#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/status.h"
@@ -56,6 +58,8 @@ DECLARE_string(builtin_ntp_servers);
DECLARE_uint32(builtin_ntp_poll_interval_ms);
DECLARE_uint32(cloud_metadata_server_request_timeout_ms);
+METRIC_DECLARE_entity(server);
+
using kudu::clock::internal::FindIntersection;
using kudu::clock::internal::Interval;
using kudu::clock::internal::RecordedResponse;
@@ -435,8 +439,13 @@ TEST(TimeIntervalsTest, ThreeResponses) {
//
// TODO(aserbin): fix the described chrony's bug, at least in thirdparty
class BuiltinNtpWithMiniChronydTest: public KuduTest {
+ public:
+ BuiltinNtpWithMiniChronydTest()
+ : metric_entity_(METRIC_ENTITY_server.Instantiate(&metric_registry_,
+ "ntp-test")) {
+ }
protected:
- void CheckInitUnsync(BuiltInNtp* ntp_client) {
+ static void CheckInitUnsync(BuiltInNtp* ntp_client) {
auto deadline = MonoTime::Now() +
MonoDelta::FromSeconds(FLAGS_ntp_initial_sync_wait_secs);
ASSERT_OK(ntp_client->Init());
@@ -469,6 +478,9 @@ class BuiltinNtpWithMiniChronydTest: public KuduTest {
ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
ASSERT_STR_CONTAINS(s.ToString(), "No suitable source for
synchronisation");
}
+
+ MetricRegistry metric_registry_;
+ scoped_refptr<MetricEntity> metric_entity_;
};
// This is a basic scenario to verify that built-in NTP client is able
@@ -493,7 +505,7 @@ TEST_F(BuiltinNtpWithMiniChronydTest, Basic) {
// chronyd supports very short polling intervals (down to 1/64 second).
FLAGS_builtin_ntp_poll_interval_ms = 50;
- BuiltInNtp c(servers_endpoints);
+ BuiltInNtp c(servers_endpoints, metric_entity_);
ASSERT_OK(c.Init());
ASSERT_EVENTUALLY([&] {
uint64_t now_us;
@@ -543,7 +555,7 @@ TEST_F(BuiltinNtpWithMiniChronydTest,
NoIntersectionIntervalAtStartup) {
// chronyd supports very short polling intervals (down to 1/64 second).
FLAGS_builtin_ntp_poll_interval_ms = 50;
- BuiltInNtp c(servers_endpoints);
+ BuiltInNtp c(servers_endpoints, metric_entity_);
NO_FATALS(CheckInitUnsync(&c));
NO_FATALS(CheckWallTimeUnavailable(&c));
}
@@ -636,7 +648,7 @@ TEST_F(BuiltinNtpWithMiniChronydTest,
SyncAndUnsyncReferenceServers) {
// Verify chronyd's client itself does not accept the set of of NTP
sources.
NO_FATALS(CheckNoNtpSource(refs));
- BuiltInNtp c(refs);
+ BuiltInNtp c(refs, metric_entity_);
NO_FATALS(CheckInitUnsync(&c));
NO_FATALS(CheckWallTimeUnavailable(&c));
};
@@ -678,7 +690,7 @@ TEST_F(BuiltinNtpWithMiniChronydTest,
SyncAndUnsyncReferenceServers) {
// Verify chronyd's client itself accepts the set of of NTP sources.
ASSERT_OK(MiniChronyd::CheckNtpSource(refs));
- BuiltInNtp c(refs);
+ BuiltInNtp c(refs, metric_entity_);
ASSERT_OK(c.Init());
ASSERT_EVENTUALLY([&] {
uint64_t now_us;
@@ -731,7 +743,7 @@ TEST_F(BuiltinNtpWithMiniChronydTest,
CloudInstanceNtpServer) {
ASSERT_OK(MiniChronyd::CheckNtpSource(servers_endpoints));
FLAGS_builtin_ntp_poll_interval_ms = 500;
- BuiltInNtp c(servers_endpoints);
+ BuiltInNtp c(servers_endpoints, metric_entity_);
ASSERT_OK(c.Init());
ASSERT_EVENTUALLY([&] {
uint64_t now_us;