This is an automated email from the ASF dual-hosted git repository. laiyingchun pushed a commit to tag kudu-1.12.0-mdh1.0.0-4c2c075-centos-release in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 8cea6f4eb41967b9339b4d972b48effdfebc5631 Author: zhangyifan8 <[email protected]> AuthorDate: Tue Mar 24 17:41:57 2020 +0800 [collector] collect more for histogram metrics 1. collect mean/percentile_75/percentile_95 for histogram metrics. 2. fix merge rules for MeanGauge metrics. --- src/kudu/collector/metrics_collector-test.cc | 143 +++++++++++++++++++++++++++ src/kudu/collector/metrics_collector.cc | 24 ++++- src/kudu/collector/metrics_collector.h | 4 +- src/kudu/scripts/falcon_screen.json | 75 ++++++++++++++ 4 files changed, 241 insertions(+), 5 deletions(-) diff --git a/src/kudu/collector/metrics_collector-test.cc b/src/kudu/collector/metrics_collector-test.cc index 865f821..fa14062 100644 --- a/src/kudu/collector/metrics_collector-test.cc +++ b/src/kudu/collector/metrics_collector-test.cc @@ -392,6 +392,7 @@ TEST(TestMetricsCollector, TestParseMetrics) { {"server_metric", "COUNTER"}, {"metric_counter1", "COUNTER"}, {"metric_counter2", "COUNTER"}, + {"average_metric", "MEANGAUGE"}, {"server_metric_histogram", "HISTOGRAM"}, {"metric_histogram1", "HISTOGRAM"}, {"metric_histogram2", "HISTOGRAM"} @@ -442,6 +443,12 @@ TEST(TestMetricsCollector, TestParseMetrics) { R"*( "value": 20 )*" R"*( }, )*" R"*( { )*" + R"*( "name": "average_metric", )*" + R"*( "value": 1, )*" + R"*( "total_sum": 10, )*" + R"*( "total_count": 10 )*" + R"*( }, )*" + R"*( { )*" R"*( "name": "metric_histogram1", )*" R"*( "total_count": 17, )*" R"*( "min": 6, )*" @@ -469,6 +476,12 @@ TEST(TestMetricsCollector, TestParseMetrics) { R"*( "value": 100 )*" R"*( }, )*" R"*( { )*" + R"*( "name": "average_metric", )*" + R"*( "value": 0.5, )*" + R"*( "total_sum": 20, )*" + R"*( "total_count": 20 )*" + R"*( }, )*" + R"*( { )*" R"*( "name": "metric_histogram1", )*" R"*( "total_count": 170, )*" R"*( "min": 60, )*" @@ -526,6 +539,30 @@ TEST(TestMetricsCollector, TestParseMetrics) { "table1", { { + "average_metric", + { + {10, 1} + } + }, + { + "metric_histogram1_mean", + { + {17, 47.8235} + } + }, + { 
+ "metric_histogram1_percentile_75", + { + {17, 62} + } + }, + { + "metric_histogram1_percentile_95", + { + {17, 72} + } + }, + { "metric_histogram1_percentile_99", { {17, 73} @@ -537,12 +574,54 @@ TEST(TestMetricsCollector, TestParseMetrics) { "table2", { { + "average_metric", + { + {20, 0.5} + } + }, + { + "metric_histogram1_mean", + { + {170, 478.235} + } + }, + { + "metric_histogram1_percentile_75", + { + {170, 620} + } + }, + { + "metric_histogram1_percentile_95", + { + {170, 720} + } + }, + { "metric_histogram1_percentile_99", { {170, 730} } }, { + "metric_histogram2_mean", + { + {34, 47.8235} + } + }, + { + "metric_histogram2_percentile_75", + { + {34, 62} + } + }, + { + "metric_histogram2_percentile_95", + { + {34, 72} + } + }, + { "metric_histogram2_percentile_99", { {34, 72} @@ -558,6 +637,34 @@ TEST(TestMetricsCollector, TestParseMetrics) { })); ASSERT_EQ(host_hist_metrics, MetricsCollector::HistMetrics({ { + "average_metric", + { + {10, 1}, + {20, 0.5} + } + }, + { + "metric_histogram1_mean", + { + {17, 47.8235}, + {170, 478.235} + } + }, + { + "metric_histogram1_percentile_75", + { + {17, 62}, + {170, 620} + } + }, + { + "metric_histogram1_percentile_95", + { + {17, 72}, + {170, 720} + } + }, + { "metric_histogram1_percentile_99", { {17, 73}, @@ -565,12 +672,48 @@ TEST(TestMetricsCollector, TestParseMetrics) { } }, { + "metric_histogram2_mean", + { + {34, 47.8235} + } + }, + { + "metric_histogram2_percentile_75", + { + {34, 62} + } + }, + { + "metric_histogram2_percentile_95", + { + {34, 72} + } + }, + { "metric_histogram2_percentile_99", { {34, 72} } }, { + "server_metric_histogram_mean", + { + {60, 76.16666666666667} + } + }, + { + "server_metric_histogram_percentile_75", + { + {60, 25} + } + }, + { + "server_metric_histogram_percentile_95", + { + {60, 66} + } + }, + { "server_metric_histogram_percentile_99", { {60, 79} diff --git a/src/kudu/collector/metrics_collector.cc b/src/kudu/collector/metrics_collector.cc index fc9f20a..b170422 100644 --- 
a/src/kudu/collector/metrics_collector.cc +++ b/src/kudu/collector/metrics_collector.cc @@ -39,6 +39,7 @@ #include "kudu/gutil/port.h" #include "kudu/gutil/strings/split.h" #include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" #include "kudu/gutil/walltime.h" #include "kudu/util/curl_util.h" #include "kudu/util/debug/trace_event.h" @@ -85,7 +86,8 @@ using strings::Substitute; namespace kudu { namespace collector { -const set<string> MetricsCollector::kRegisterPercentiles = {"percentile_99"}; +const set<string> MetricsCollector::kRegisterPercentiles = + {"mean", "percentile_75", "percentile_95", "percentile_99"}; MetricsCollector::MetricsCollector(scoped_refptr<NodesChecker> nodes_checker, scoped_refptr<ReporterBase> reporter) @@ -244,6 +246,10 @@ Status MetricsCollector::ExtractMetricTypes(const JsonReader& r, for (const Value* metric : metrics) { string name; RETURN_NOT_OK(r.ExtractString(metric, "name", &name)); + if (HasPrefixString(name, "average_")) { + EmplaceOrDie(metric_types, std::make_pair(name, "MEANGAUGE")); + continue; + } string type; RETURN_NOT_OK(r.ExtractString(metric, "type", &type)); string upper_type; @@ -656,14 +662,26 @@ Status MetricsCollector::ParseEntityMetrics(const JsonReader& r, auto& found_metric = FindOrDie(*merged_kv_metrics, name); found_metric += value; } + } else if (*known_type == "MEANGAUGE") { + double total_count; + CHECK_OK(r.ExtractDouble(metric, "total_count", &total_count)); + double value; + CHECK_OK(r.ExtractDouble(metric, "value", &value)); + vector<SimpleHistogram> tmp({{static_cast<int64_t>(total_count), value}}); + EmplaceOrDie(hist_metrics, std::make_pair(name, tmp)); + if (merged_hist_metrics && + !EmplaceIfNotPresent(merged_hist_metrics, std::make_pair(name, tmp))) { + auto& found_hist_metric = FindOrDie(*merged_hist_metrics, name); + found_hist_metric.emplace_back(tmp[0]); + } } else if (*known_type == "HISTOGRAM") { for (const auto& percentile : kRegisterPercentiles) { string 
hist_metric_name(name); hist_metric_name += "_" + percentile; int64_t total_count; CHECK_OK(r.ExtractInt64(metric, "total_count", &total_count)); - int64_t percentile_value; - CHECK_OK(r.ExtractInt64(metric, percentile.c_str(), &percentile_value)); + double percentile_value; + CHECK_OK(r.ExtractDouble(metric, percentile.c_str(), &percentile_value)); vector<SimpleHistogram> tmp({{total_count, percentile_value}}); EmplaceOrDie(hist_metrics, std::make_pair(hist_metric_name, tmp)); if (merged_hist_metrics && diff --git a/src/kudu/collector/metrics_collector.h b/src/kudu/collector/metrics_collector.h index 2401030..5c90002 100644 --- a/src/kudu/collector/metrics_collector.h +++ b/src/kudu/collector/metrics_collector.h @@ -87,8 +87,8 @@ class MetricsCollector : public RefCounted<MetricsCollector> { int64_t count; // 'percentile_xxx" value in histogram metric, percentile_xxx is specified // by kRegisterPercentiles. - int64_t value; - SimpleHistogram(int64_t c, int64_t v) : count(c), value(v) { + double value; + SimpleHistogram(int64_t c, double v) : count(c), value(v) { } inline bool operator==(const SimpleHistogram& rhs) const { return count == rhs.count && value == rhs.value; diff --git a/src/kudu/scripts/falcon_screen.json b/src/kudu/scripts/falcon_screen.json index e3ae54c..a7046b1 100644 --- a/src/kudu/scripts/falcon_screen.json +++ b/src/kudu/scripts/falcon_screen.json @@ -113,27 +113,51 @@ "metric=all_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4", "metric=alter_schema_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4", "metric=average_diskrowset_height service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups_per_op_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups_per_op_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=bloom_lookups_per_op_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", 
"metric=bloom_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=bloom_lookups service=kudu cluster=${cluster.name} level=${level} v=4", "metric=bytes_flushed service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_c": [ + "metric=commit_wait_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=commit_wait_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=commit_wait_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=commit_wait_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=compact_rs_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_d": [ "metric=delta_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_file_lookups_per_op_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_file_lookups_per_op_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_file_lookups_per_op_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=delta_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_major_compact_rs_duration_percentile_95 service=kudu cluster=${cluster.name} 
level=${level} v=4", "metric=delta_major_compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=delta_major_compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_minor_compact_rs_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_minor_compact_rs_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=delta_minor_compact_rs_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=delta_minor_compact_rs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=delta_minor_compact_rs_running service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_f": [ "metric=failed_elections_since_stable_leader service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_dms_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=flush_dms_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=flush_dms_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=flush_mrs_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=flush_mrs_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=flush_mrs_running service=kudu cluster=${cluster.name} level=${level} v=4", "metric=follower_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4" @@ -142,23 +166,47 @@ "metric=in_progress_ops 
service=kudu cluster=${cluster.name} level=${level} v=4", "metric=insertions_failed_dup_key service=kudu cluster=${cluster.name} level=${level} v=4", "metric=key_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups_per_op_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups_per_op_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=key_file_lookups_per_op_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=key_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=kudu-table-health service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_l": [ "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_append_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_append_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_append_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_append_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_bytes_logged service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_cache_num_ops service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_cache_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_entry_batches_per_group_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_entry_batches_per_group_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_entry_batches_per_group_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_entry_batches_per_group_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_gc_duration_mean service=kudu cluster=${cluster.name} level=${level} 
v=4", + "metric=log_gc_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_gc_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_gc_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_gc_running service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_group_commit_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_group_commit_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_group_commit_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_group_commit_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_reader_bytes_read service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_reader_entries_read service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_read_batch_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_read_batch_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_reader_read_batch_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_reader_read_batch_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_roll_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_roll_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_roll_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_roll_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_sync_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=log_sync_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + 
"metric=log_sync_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=lth service=kudu cluster=${cluster.name} level=${level} v=4" ], @@ -171,8 +219,17 @@ "table_o": [ "metric=on_disk_data_size service=kudu cluster=${cluster.name} level=${level} v=4", "metric=on_disk_size service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_length_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_length_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_length_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=op_prepare_queue_length_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_time_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_time_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_queue_time_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=op_prepare_queue_time_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_run_time_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_run_time_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=op_prepare_run_time_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=op_prepare_run_time_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=ops_behind_leader service=kudu cluster=${cluster.name} level=${level} v=4" ], @@ -192,14 +249,26 @@ "metric=scanner_rows_returned service=kudu cluster=${cluster.name} level=${level} v=4", "metric=scanner_rows_scanned service=kudu cluster=${cluster.name} level=${level} v=4", "metric=scans_started service=kudu 
cluster=${cluster.name} level=${level} v=4", + "metric=snapshot_read_inflight_wait_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=snapshot_read_inflight_wait_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=snapshot_read_inflight_wait_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=snapshot_read_inflight_wait_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=state service=kudu cluster=${cluster.name} level=${level} v=4" ], "table_u": [ "metric=undo_delta_block_estimated_retained_bytes service=kudu cluster=${cluster.name} level=${level} v=4", "metric=undo_delta_block_gc_bytes_deleted service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_delete_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_delete_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_delete_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=undo_delta_block_gc_delete_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_init_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_init_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_init_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=undo_delta_block_gc_init_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_perform_duration_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=undo_delta_block_gc_perform_duration_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + 
"metric=undo_delta_block_gc_perform_duration_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=undo_delta_block_gc_perform_duration_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=undo_delta_block_gc_running service=kudu cluster=${cluster.name} level=${level} v=4", "metric=upserts_as_updates service=kudu cluster=${cluster.name} level=${level} v=4" @@ -208,7 +277,13 @@ "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4", "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4", "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_client_propagated_consistency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_client_propagated_consistency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_client_propagated_consistency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=write_op_duration_client_propagated_consistency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_commit_wait_consistency_mean service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_commit_wait_consistency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4", + "metric=write_op_duration_commit_wait_consistency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=write_op_duration_commit_wait_consistency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4", "metric=write_transactions_inflight service=kudu cluster=${cluster.name} level=${level} v=4" ],
