This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 6371ffdb54fa09f891e01a120e74993f38286c9f Author: Alexey Serbin <[email protected]> AuthorDate: Tue Feb 11 20:56:05 2020 -0800 [hybrid clock] KUDU-3048 introduce new clock metrics Introduced additional metrics for the hybrid clock: * whether hybrid clock is using extrapolated readings for the underlying clock instead of actual readings * histogram for the duration of intervals when the underlying clock was extrapolated * histogram for the maximum errors reported by the underlying clock I ran a small Kudu cluster to manually verify the behavior of the newly introduced metrics: I'm not sure it's worth adding automated tests for this given the already existing 'hybrid_clock_error' metric didn't have any test coverage. Change-Id: I8575ba7d8baed78b13351e8cebf1a74f44b31b82 Reviewed-on: http://gerrit.cloudera.org:8080/15212 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin <[email protected]> --- src/kudu/clock/hybrid_clock.cc | 60 +++++++++++++++++++++++++++++++++++------- src/kudu/clock/hybrid_clock.h | 28 +++++++++++++++++--- src/kudu/server/server_base.cc | 2 -- src/kudu/util/metrics.cc | 2 +- 4 files changed, 75 insertions(+), 17 deletions(-) diff --git a/src/kudu/clock/hybrid_clock.cc b/src/kudu/clock/hybrid_clock.cc index 4280eca..d2a94fc 100644 --- a/src/kudu/clock/hybrid_clock.cc +++ b/src/kudu/clock/hybrid_clock.cc @@ -117,16 +117,37 @@ bool ValidateTimeSource() { } GROUP_FLAG_VALIDATOR(time_source_guardrail, ValidateTimeSource); -METRIC_DEFINE_gauge_uint64(server, hybrid_clock_timestamp, - "Hybrid Clock Timestamp", - kudu::MetricUnit::kMicroseconds, - "Hybrid clock timestamp.", - kudu::MetricLevel::kInfo); +METRIC_DEFINE_gauge_bool(server, hybrid_clock_extrapolating, + "Hybrid Clock Is Being Extrapolated", + kudu::MetricUnit::kState, + "Whether HybridClock timestamps are extrapolated " + "because of inability to read the underlying clock", + kudu::MetricLevel::kWarn); METRIC_DEFINE_gauge_uint64(server, hybrid_clock_error, "Hybrid Clock Error", kudu::MetricUnit::kMicroseconds, - "Server clock maximum error.", + "Server clock maximum error", + kudu::MetricLevel::kInfo); +METRIC_DEFINE_gauge_uint64(server, hybrid_clock_timestamp, + "Hybrid Clock Timestamp", + kudu::MetricUnit::kMicroseconds, + "Hybrid clock timestamp", kudu::MetricLevel::kInfo); +METRIC_DEFINE_histogram(server, hybrid_clock_max_errors, + "Hybrid Clock Maximum Errors", + kudu::MetricUnit::kMilliseconds, + "The statistics on the maximum error of the underlying " + "clock", + kudu::MetricLevel::kInfo, + 20000, 3); +METRIC_DEFINE_histogram(server, hybrid_clock_extrapolation_intervals, + "Intervals of Hybrid Clock Extrapolation", + kudu::MetricUnit::kSeconds, + "The statistics on the duration of intervals when the " + "underlying clock was extrapolated instead of using " + "the direct readings", + kudu::MetricLevel::kWarn, + 10000, 3); namespace kudu { namespace clock { @@ -153,6 +174,12 @@ HybridClock::HybridClock(const scoped_refptr<MetricEntity>& metric_entity) : next_timestamp_(0), state_(kNotInitialized) { DCHECK(metric_entity); + max_errors_histogram_ = + METRIC_hybrid_clock_max_errors.Instantiate(metric_entity); + extrapolation_intervals_histogram_ = + METRIC_hybrid_clock_extrapolation_intervals.Instantiate(metric_entity); + extrapolating_ = metric_entity->FindOrCreateGauge( + &METRIC_hybrid_clock_extrapolating, false, MergeType::kMax); METRIC_hybrid_clock_timestamp.InstantiateFunctionGauge( metric_entity, Bind(&HybridClock::NowForMetrics, Unretained(this)))-> @@ -418,12 +445,13 @@ void HybridClock::NowWithErrorUnlocked(Timestamp *timestamp, } Status HybridClock::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) { - bool is_extrapolated = false; auto read_time_before = MonoTime::Now(); Status s = time_service_->WalltimeWithError(now_usec, error_usec); auto read_time_after = MonoTime::Now(); + bool is_extrapolated = false; if (PREDICT_TRUE(s.ok())) { + max_errors_histogram_->Increment(*error_usec / 1000); // We got a good clock read. Remember this in case the clock later becomes // unsynchronized and we need to extrapolate from here. // @@ -448,6 +476,13 @@ Status HybridClock::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) MonoDelta::FromMicroseconds(read_time_error_us); std::unique_lock<decltype(last_clock_read_lock_)> l(last_clock_read_lock_); + if (is_extrapolating_) { + is_extrapolating_ = false; + extrapolating_->set_value(is_extrapolating_); + uint64_t duration_us = (*now_usec > last_clock_read_physical_) + ? (*now_usec - last_clock_read_physical_) : 0; + extrapolation_intervals_histogram_->Increment(duration_us / 1000000); + } if (!last_clock_read_time_.Initialized() || last_clock_read_time_ < read_time_max_likelihood) { last_clock_read_time_ = read_time_max_likelihood; @@ -458,6 +493,10 @@ Status HybridClock::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) // We failed to read the clock. Extrapolate the new time based on our // last successful read. std::unique_lock<decltype(last_clock_read_lock_)> l(last_clock_read_lock_); + if (!is_extrapolating_) { + is_extrapolating_ = true; + extrapolating_->set_value(is_extrapolating_); + } if (!last_clock_read_time_.Initialized()) { RETURN_NOT_OK_PREPEND(s, "could not read system time source"); } @@ -468,10 +507,11 @@ Status HybridClock::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) *error_usec = last_clock_read_error_ + accum_error_us; is_extrapolated = true; l.unlock(); - // Log after unlocking to minimize the lock hold time. - KLOG_EVERY_N_SECS(ERROR, 1) << "Unable to read clock for last " - << time_since_last_read.ToString() << ": " << s.ToString(); + // Log after unlocking to minimize the lock hold time. + KLOG_EVERY_N_SECS(ERROR, 1) << Substitute( + "unable to read clock for last $0: $1", + time_since_last_read.ToString(), s.ToString()); } // If the clock is synchronized but has max_error beyond max_clock_sync_error_usec diff --git a/src/kudu/clock/hybrid_clock.h b/src/kudu/clock/hybrid_clock.h index 1589937..a00335e 100644 --- a/src/kudu/clock/hybrid_clock.h +++ b/src/kudu/clock/hybrid_clock.h @@ -197,16 +197,19 @@ class HybridClock : public Clock { simple_spinlock lock_; // The next timestamp to be generated from this clock, assuming that - // the physical clock hasn't advanced beyond the value stored here. Guarded - // by 'lock_'. + // the physical clock hasn't advanced beyond the value stored here. + // Protected by 'lock_'. uint64_t next_timestamp_; // The last valid clock reading we got from the time source, along - // with the monotime that we took that reading. - simple_spinlock last_clock_read_lock_; + // with the monotime that we took that reading. The 'is_extrapolating' field + // tracks whether extrapolated or real readings of the underlying clock are + // used to generate hybrid timestamps. + simple_spinlock last_clock_read_lock_; // protects four fields below MonoTime last_clock_read_time_; uint64_t last_clock_read_physical_; uint64_t last_clock_read_error_; + bool is_extrapolating_ = false; // The state of the object. Guarded by 'lock_'. enum State { @@ -215,6 +218,23 @@ class HybridClock : public Clock { }; State state_; + // Whether the hybrid clock is extrapolating the readings of the underlying + // clock instead of using the real ones. It's important to know whether + // the extrapolation is happening, but 'extrapolation_intervals_histogram_' + // metric doesn't allow for exposing this fact prior to the end of current + // extrapolation interval. + scoped_refptr<AtomicGauge<bool>> extrapolating_; + + // Stats on the underlying clock's 'maximum clock' metric sampled every + // NowWithError() call (essentially, every call when requesting a hybrid clock + // timestamp). + scoped_refptr<Histogram> max_errors_histogram_; + + // Stats on time intervals when the underlying clock was extrapolated + // instead of using the actual readings. Extrapolation happens when an attempt + // to read the clock yields an error (clock might be unsynchronized, etc.). + scoped_refptr<Histogram> extrapolation_intervals_histogram_; + // Clock metrics are set to detach to their last value. This means // that, during our destructor, we'll need to access other class members // declared above this. Hence, this member must be declared last. diff --git a/src/kudu/server/server_base.cc b/src/kudu/server/server_base.cc index a5bf5ad..36b5093 100644 --- a/src/kudu/server/server_base.cc +++ b/src/kudu/server/server_base.cc @@ -492,9 +492,7 @@ Status ServerBase::Init() { // so we're less likely to get into a partially initialized state on disk during startup // if we're having clock problems. RETURN_NOT_OK_PREPEND(clock_->Init(), "Cannot initialize clock"); - RETURN_NOT_OK(security::InitKerberosForServer(FLAGS_principal, FLAGS_keytab_file)); - RETURN_NOT_OK(file_cache_->Init()); fs::FsReport report; diff --git a/src/kudu/util/metrics.cc b/src/kudu/util/metrics.cc index 2ad5eed..d250387 100644 --- a/src/kudu/util/metrics.cc +++ b/src/kudu/util/metrics.cc @@ -897,7 +897,7 @@ HistogramPrototype::HistogramPrototype(const MetricPrototype::CtorArgs& args, : MetricPrototype(args), max_trackable_value_(max_trackable_value), num_sig_digits_(num_sig_digits) { - // Better to crash at definition time that at instantiation time. + // Better to crash at definition time than at instantiation time. CHECK(HdrHistogram::IsValidHighestTrackableValue(max_trackable_value)) << Substitute("Invalid max trackable value on histogram $0: $1", args.name_, max_trackable_value);
