IMPALA-4187: Switch RPC latency metrics to histograms It's usually better to measure latency distributions with histograms, not avg / min / max. This change switches out the metrics that measure the latency of RPC processing time to histograms.
Add HistogramMetric::Reset() to allow histogram to remove all entries. Added spinlock around histogram access. On a 8-core i7 @ 3.4GHz, the following throughputs were observed: 1 thread -> 25M updates/sec 4 threads -> 7M updates/sec 16 threads -> 5M updates/sec Each histogram takes ~108KB of storage for its buckets. This can be reduced by reducing the maximum value, currently 60 minutes. The new metrics have the following text representation: Count: 148, 25th %-ile: 0, 50th %-ile: 0, 75th %-ile: 0, 90th %-ile: 0, 95th %-ile: 0, 99.9th %-ile: 1ms Which changes from the old: count: 345, last: 6ms, min: 0, max: 12ms, mean: 1ms, stddev: 1ms Change-Id: I9ba6d4270dd5676eeeff35ad8d9dc5dcddd95e34 Reviewed-on: http://gerrit.cloudera.org:8080/4516 Reviewed-by: Henry Robinson <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/872d5462 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/872d5462 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/872d5462 Branch: refs/heads/master Commit: 872d5462b9a17d9f2d9edeb08728e29d6eb6eca6 Parents: 435aec5 Author: Henry Robinson <[email protected]> Authored: Wed Sep 21 16:29:57 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Mon Sep 26 22:07:17 2016 +0000 ---------------------------------------------------------------------- be/src/rpc/rpc-trace.cc | 22 +++++++---- be/src/rpc/rpc-trace.h | 6 +-- be/src/util/histogram-metric.h | 73 ++++++++++++++++++++++++------------- common/thrift/metrics.json | 2 +- 4 files changed, 65 insertions(+), 38 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/872d5462/be/src/rpc/rpc-trace.cc ---------------------------------------------------------------------- diff --git a/be/src/rpc/rpc-trace.cc b/be/src/rpc/rpc-trace.cc index 407c570..f4da19b 100644 --- a/be/src/rpc/rpc-trace.cc +++ b/be/src/rpc/rpc-trace.cc @@ -33,7 +33,7 @@ using namespace rapidjson; using namespace strings; // Metric key format for rpc call duration metrics. -const string RPC_TIME_STATS_METRIC_KEY = "rpc-method.$0.call_duration"; +const string RPC_PROCESSING_TIME_DISTRIBUTION_METRIC_KEY = "rpc-method.$0.call_duration"; // Singleton class to keep track of all RpcEventHandlers, and to render them to a // web-based summary page. @@ -123,14 +123,14 @@ void RpcEventHandler::Reset(const string& method_name) { lock_guard<mutex> l(method_map_lock_); MethodMap::iterator it = method_map_.find(method_name); if (it == method_map_.end()) return; - it->second->time_stats->Reset(); + it->second->processing_time_distribution->Reset(); it->second->num_in_flight.Store(0L); } void RpcEventHandler::ResetAll() { lock_guard<mutex> l(method_map_lock_); for (const MethodMap::value_type& method: method_map_) { - method.second->time_stats->Reset(); + method.second->processing_time_distribution->Reset(); method.second->num_in_flight.Store(0L); } } @@ -149,7 +149,8 @@ void RpcEventHandler::ToJson(Value* server, Document* document) { Value method(kObjectType); Value method_name(rpc.first.c_str(), document->GetAllocator()); method.AddMember("name", method_name, document->GetAllocator()); - const string& human_readable = rpc.second->time_stats->ToHumanReadable(); + const string& human_readable = + rpc.second->processing_time_distribution->ToHumanReadable(); Value summary(human_readable.c_str(), document->GetAllocator()); method.AddMember("summary", summary, document->GetAllocator()); method.AddMember("in_flight", rpc.second->num_in_flight.Load(), @@ -171,9 +172,14 @@ void* RpcEventHandler::getContext(const char* fn_name, void* server_context) { if (it == method_map_.end()) { MethodDescriptor* descriptor = new MethodDescriptor(); descriptor->name = fn_name; - const string& rpc_name = Substitute("$0.$1", server_name_, descriptor->name); - descriptor->time_stats = StatsMetric<double>::CreateAndRegister(metrics_, - RPC_TIME_STATS_METRIC_KEY, rpc_name); + const string& rpc_name = Substitute(RPC_PROCESSING_TIME_DISTRIBUTION_METRIC_KEY, + Substitute("$0.$1", server_name_, descriptor->name)); + const TMetricDef& def = + MakeTMetricDef(rpc_name, TMetricKind::HISTOGRAM, TUnit::TIME_MS); + constexpr int32_t SIXTY_MINUTES_IN_MS = 60 * 1000 * 60; + // Store processing times of up to 60 minutes with 3 sig. fig. + descriptor->processing_time_distribution = + metrics_->RegisterMetric(new HistogramMetric(def, SIXTY_MINUTES_IN_MS, 3)); it = method_map_.insert(make_pair(descriptor->name, descriptor)).first; } } @@ -197,5 +203,5 @@ void RpcEventHandler::postWrite(void* ctx, const char* fn_name, uint32_t bytes) MethodDescriptor* descriptor = rpc_ctx->method_descriptor; delete rpc_ctx; descriptor->num_in_flight.Add(-1); - descriptor->time_stats->Update(elapsed_time); + descriptor->processing_time_distribution->Update(elapsed_time); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/872d5462/be/src/rpc/rpc-trace.h ---------------------------------------------------------------------- diff --git a/be/src/rpc/rpc-trace.h b/be/src/rpc/rpc-trace.h index 3ee2c6e..b2861af 100644 --- a/be/src/rpc/rpc-trace.h +++ b/be/src/rpc/rpc-trace.h @@ -18,8 +18,8 @@ #ifndef IMPALA_RPC_RPC_TRACE_H #define IMPALA_RPC_RPC_TRACE_H -#include "util/collection-metrics.h" #include "rpc/thrift-server.h" +#include "util/histogram-metric.h" #include "util/internal-queue.h" #include <thrift/TProcessor.h> @@ -83,8 +83,8 @@ class RpcEventHandler : public apache::thrift::TProcessorEventHandler { /// Name of the method std::string name; - /// Summary statistics for the time taken to respond to this method - StatsMetric<double>* time_stats; + /// Distribution of the time taken to process this RPC. + HistogramMetric* processing_time_distribution; /// Number of invocations in flight AtomicInt32 num_in_flight; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/872d5462/be/src/util/histogram-metric.h ---------------------------------------------------------------------- diff --git a/be/src/util/histogram-metric.h b/be/src/util/histogram-metric.h index 211deee..5a61648 100644 --- a/be/src/util/histogram-metric.h +++ b/be/src/util/histogram-metric.h @@ -20,10 +20,12 @@ #include "util/hdr-histogram.h" #include "util/metrics.h" +#include "util/spinlock.h" namespace impala { /// Metric which constructs (using HdrHistogram) a histogram of a set of values. +/// Thread-safe: histogram access protected by a spin lock. class HistogramMetric : public Metric { public: /// Constructs a new histogram metric. `highest_trackable_value` is the maximum value @@ -31,27 +33,33 @@ class HistogramMetric : public Metric { /// that values must be stored with. HistogramMetric( const TMetricDef& def, uint64_t highest_trackable_value, int num_significant_digits) - : Metric(def), histogram_(highest_trackable_value, num_significant_digits), - unit_(def.units) { } + : Metric(def), + histogram_(new HdrHistogram(highest_trackable_value, num_significant_digits)), + unit_(def.units) { + DCHECK_EQ(TMetricKind::HISTOGRAM, def.kind); + } virtual void ToJson(rapidjson::Document* document, rapidjson::Value* value) { rapidjson::Value container(rapidjson::kObjectType); AddStandardFields(document, &container); - container.AddMember("25th %-ile", histogram_.ValueAtPercentile(25), - document->GetAllocator()); - container.AddMember("50th %-ile", histogram_.ValueAtPercentile(50), - document->GetAllocator()); - container.AddMember("75th %-ile", histogram_.ValueAtPercentile(75), - document->GetAllocator()); - container.AddMember("90th %-ile", histogram_.ValueAtPercentile(90), - document->GetAllocator()); - container.AddMember("95th %-ile", histogram_.ValueAtPercentile(95), - document->GetAllocator()); - container.AddMember("99.9th %-ile", histogram_.ValueAtPercentile(99.9), - document->GetAllocator()); - container.AddMember("count", histogram_.TotalCount(), document->GetAllocator()); - + { + boost::lock_guard<SpinLock> l(lock_); + + container.AddMember( + "25th %-ile", histogram_->ValueAtPercentile(25), document->GetAllocator()); + container.AddMember( + "50th %-ile", histogram_->ValueAtPercentile(50), document->GetAllocator()); + container.AddMember( + "75th %-ile", histogram_->ValueAtPercentile(75), document->GetAllocator()); + container.AddMember( + "90th %-ile", histogram_->ValueAtPercentile(90), document->GetAllocator()); + container.AddMember( + "95th %-ile", histogram_->ValueAtPercentile(95), document->GetAllocator()); + container.AddMember( + "99.9th %-ile", histogram_->ValueAtPercentile(99.9), document->GetAllocator()); + container.AddMember("count", histogram_->TotalCount(), document->GetAllocator()); + } rapidjson::Value type_value(PrintTMetricKind(TMetricKind::HISTOGRAM).c_str(), document->GetAllocator()); container.AddMember("kind", type_value, document->GetAllocator()); @@ -61,33 +69,46 @@ class HistogramMetric : public Metric { *value = container; } - void Update(int64_t val) { histogram_.Increment(val); } + void Update(int64_t val) { + boost::lock_guard<SpinLock> l(lock_); + histogram_->Increment(val); + } + + /// Reset the histogram by removing all previous entries. + void Reset() { + boost::lock_guard<SpinLock> l(lock_); + uint64_t highest = histogram_->highest_trackable_value(); + int digits = histogram_->num_significant_digits(); + histogram_.reset(new HdrHistogram(highest, digits)); + } virtual void ToLegacyJson(rapidjson::Document*) { } const TUnit::type& unit() const { return unit_; } virtual std::string ToHumanReadable() { + boost::lock_guard<SpinLock> l(lock_); std::stringstream out; - out <<"Count: " << histogram_.TotalCount() << ", " + out << "Count: " << histogram_->TotalCount() << ", " << "25th %-ile: " - << PrettyPrinter::Print(histogram_.ValueAtPercentile(25), unit_) << ", " + << PrettyPrinter::Print(histogram_->ValueAtPercentile(25), unit_) << ", " << "50th %-ile: " - << PrettyPrinter::Print(histogram_.ValueAtPercentile(50), unit_) << ", " + << PrettyPrinter::Print(histogram_->ValueAtPercentile(50), unit_) << ", " << "75th %-ile: " - << PrettyPrinter::Print(histogram_.ValueAtPercentile(75), unit_) << ", " + << PrettyPrinter::Print(histogram_->ValueAtPercentile(75), unit_) << ", " << "90th %-ile: " - << PrettyPrinter::Print(histogram_.ValueAtPercentile(90), unit_) << ", " + << PrettyPrinter::Print(histogram_->ValueAtPercentile(90), unit_) << ", " << "95th %-ile: " - << PrettyPrinter::Print(histogram_.ValueAtPercentile(95), unit_) << ", " + << PrettyPrinter::Print(histogram_->ValueAtPercentile(95), unit_) << ", " << "99.9th %-ile: " - << PrettyPrinter::Print(histogram_.ValueAtPercentile(99.9), unit_); + << PrettyPrinter::Print(histogram_->ValueAtPercentile(99.9), unit_); return out.str(); } private: - HdrHistogram histogram_; - + // Protects histogram_ pointer itself. + SpinLock lock_; + boost::scoped_ptr<HdrHistogram> histogram_; const TUnit::type unit_; }; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/872d5462/common/thrift/metrics.json ---------------------------------------------------------------------- diff --git a/common/thrift/metrics.json b/common/thrift/metrics.json index b600f80..70dbb12 100644 --- a/common/thrift/metrics.json +++ b/common/thrift/metrics.json @@ -748,7 +748,7 @@ ], "label": "$0 RPC Call Duration", "units": "TIME_MS", - "kind": "STATS", + "kind": "HISTOGRAM", "key": "rpc-method.$0.call_duration" }, {
