This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 2880d3a [util] add a few new metrics in MaintenanceManager
2880d3a is described below
commit 2880d3a0dc0fc07d20fab79f1d0df3a6a622e398
Author: Alexey Serbin <[email protected]>
AuthorDate: Sun Jan 10 19:57:34 2021 -0800
[util] add a few new metrics in MaintenanceManager
This patch adds a couple of metrics for MaintenanceManager to track the
duration of choosing the best candidate among available maintenance
operations and number of times the Prepare() method for a maintenance
operation failed:
* maintenance_op_find_best_candidate_duration
* maintenance_op_prepare_failed
In addition, it adds SCOPED_LOG_SLOW_EXECUTION with the threshold of
10 seconds into the MaintenanceManager::FindBestOp() method.
At this point, I manually verified that those metrics are present and
show relevant information. I'm planning to add an automated test
to cover the behavior of these new metrics in [1] to have less conflicts
with the mentioned patch.
The motivation for this change is a finding that FindBestOp()'s
computational complexity is O(n^2) of the number of replicas per tablet
server (each tablet replica registers about 8 maintenance operations).
Also, BudgetedCompactionPolicy::RunApproximation()'s computational
complexity is O(n^2) of the number of rowset in max and min keys.
In the wild, there was an instance of a Kudu cluster with high data
ingest ratio with the following stack showing in every snapshot in the
diagnostic logs for many hours in a row:
0xa11735 kudu::tablet::BudgetedCompactionPolicy::RunApproximation()
0xa129c9 kudu::tablet::BudgetedCompactionPolicy::PickRowSets()
0x9c8d80 kudu::tablet::Tablet::UpdateCompactionStats()
0x9ec848 kudu::tablet::CompactRowSetsOp::UpdateStats()
0x1b3de5c kudu::MaintenanceManager::FindBestOp()
0x1b3f3c5 kudu::MaintenanceManager::RunSchedulerThread()
0x1b86014 kudu::Thread::SuperviseThread()
[1] https://gerrit.cloudera.org/#/c/16937/
Change-Id: If5420afd605f9bd22207af142b49e73336907486
Reviewed-on: http://gerrit.cloudera.org:8080/16938
Reviewed-by: Andrew Wong <[email protected]>
Tested-by: Kudu Jenkins
---
src/kudu/master/master.cc | 4 +-
src/kudu/tserver/tablet_server.cc | 2 +-
src/kudu/util/CMakeLists.txt | 1 +
src/kudu/util/maintenance_manager-test.cc | 12 +++++-
src/kudu/util/maintenance_manager.cc | 16 ++++++--
src/kudu/util/maintenance_manager.h | 12 ++++--
src/kudu/util/maintenance_manager_metrics.cc | 58 ++++++++++++++++++++++++++++
src/kudu/util/maintenance_manager_metrics.h | 46 ++++++++++++++++++++++
8 files changed, 141 insertions(+), 10 deletions(-)
diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc
index 4f693d9..45a8837 100644
--- a/src/kudu/master/master.cc
+++ b/src/kudu/master/master.cc
@@ -200,7 +200,9 @@ Status Master::Init() {
}
maintenance_manager_.reset(new MaintenanceManager(
- MaintenanceManager::kDefaultOptions, fs_manager_->uuid()));
+ MaintenanceManager::kDefaultOptions,
+ fs_manager_->uuid(),
+ metric_entity()));
// The certificate authority object is initialized upon loading
// CA private key and certificate from the system table when the server
diff --git a/src/kudu/tserver/tablet_server.cc
b/src/kudu/tserver/tablet_server.cc
index c8b87f3..6aa59ff 100644
--- a/src/kudu/tserver/tablet_server.cc
+++ b/src/kudu/tserver/tablet_server.cc
@@ -100,7 +100,7 @@ Status TabletServer::Init() {
}
maintenance_manager_ = std::make_shared<MaintenanceManager>(
- MaintenanceManager::kDefaultOptions, fs_manager_->uuid());
+ MaintenanceManager::kDefaultOptions, fs_manager_->uuid(),
metric_entity());
heartbeater_.reset(new Heartbeater(std::move(master_addrs), this));
diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt
index b340f04..fc5955b 100644
--- a/src/kudu/util/CMakeLists.txt
+++ b/src/kudu/util/CMakeLists.txt
@@ -197,6 +197,7 @@ set(UTIL_SRCS
locks.cc
logging.cc
maintenance_manager.cc
+ maintenance_manager_metrics.cc
malloc.cc
memcmpable_varint.cc
memory/arena.cc
diff --git a/src/kudu/util/maintenance_manager-test.cc
b/src/kudu/util/maintenance_manager-test.cc
index e43e60d..99a8b39 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -75,6 +75,11 @@ static const char kFakeUuid[] = "12345";
class MaintenanceManagerTest : public KuduTest {
public:
+ MaintenanceManagerTest()
+ : metric_entity_(METRIC_ENTITY_server.Instantiate(
+ &metric_registry_, "test_entity")) {
+ }
+
void SetUp() override {
StartManager(2);
}
@@ -88,9 +93,9 @@ class MaintenanceManagerTest : public KuduTest {
options.num_threads = num_threads;
options.polling_interval_ms = 1;
options.history_size = kHistorySize;
- manager_.reset(new MaintenanceManager(options, kFakeUuid));
+ manager_.reset(new MaintenanceManager(options, kFakeUuid, metric_entity_));
manager_->set_memory_pressure_func_for_tests(
- [&](double* consumption) {
+ [&](double* /* consumption */) {
return indicate_memory_pressure_.load();
});
ASSERT_OK(manager_->Start());
@@ -101,6 +106,9 @@ class MaintenanceManagerTest : public KuduTest {
}
protected:
+ MetricRegistry metric_registry_;
+ scoped_refptr<MetricEntity> metric_entity_;
+
shared_ptr<MaintenanceManager> manager_;
std::atomic<bool> indicate_memory_pressure_ { false };
};
diff --git a/src/kudu/util/maintenance_manager.cc
b/src/kudu/util/maintenance_manager.cc
index 75ffd8b..64a361a 100644
--- a/src/kudu/util/maintenance_manager.cc
+++ b/src/kudu/util/maintenance_manager.cc
@@ -171,8 +171,10 @@ const MaintenanceManager::Options
MaintenanceManager::kDefaultOptions = {
.history_size = 0,
};
-MaintenanceManager::MaintenanceManager(const Options& options,
- string server_uuid)
+MaintenanceManager::MaintenanceManager(
+ const Options& options,
+ string server_uuid,
+ const scoped_refptr<MetricEntity>& metric_entity)
: server_uuid_(std::move(server_uuid)),
num_threads_(options.num_threads > 0
? options.num_threads
@@ -186,7 +188,8 @@ MaintenanceManager::MaintenanceManager(const Options&
options,
running_ops_(0),
completed_ops_count_(0),
rand_(GetRandomSeed32()),
- memory_pressure_func_(&process_memory::UnderMemoryPressure) {
+ memory_pressure_func_(&process_memory::UnderMemoryPressure),
+ metrics_(CHECK_NOTNULL(metric_entity)) {
CHECK_OK(ThreadPoolBuilder("MaintenanceMgr")
.set_min_threads(num_threads_)
.set_max_threads(num_threads_)
@@ -361,6 +364,7 @@ void MaintenanceManager::RunSchedulerThread() {
if (!op->Prepare()) {
LOG_WITH_PREFIX(INFO) << "Prepare failed for " << op->name()
<< ". Re-running scheduler.";
+ metrics_.SubmitOpPrepareFailed();
std::lock_guard<Mutex> guard(running_instances_lock_);
DecreaseOpCountAndNotifyWaiters(op);
continue;
@@ -392,12 +396,18 @@ void MaintenanceManager::RunSchedulerThread() {
// ops that free memory, then ops that free data disk space, then ops that
// improve performance.
pair<MaintenanceOp*, string> MaintenanceManager::FindBestOp() {
+ SCOPED_LOG_SLOW_EXECUTION(WARNING, 10000, "finding best maintenance
operation");
TRACE_EVENT0("maintenance", "MaintenanceManager::FindBestOp");
if (!HasFreeThreads()) {
return {nullptr, "no free threads"};
}
+ const auto start_time = MonoTime::Now();
+ SCOPED_CLEANUP({
+ metrics_.SubmitOpFindDuration(MonoTime::Now() - start_time);
+ });
+
int64_t low_io_most_logs_retained_bytes = 0;
MaintenanceOp* low_io_most_logs_retained_bytes_op = nullptr;
diff --git a/src/kudu/util/maintenance_manager.h
b/src/kudu/util/maintenance_manager.h
index 760eb8e..f14c498 100644
--- a/src/kudu/util/maintenance_manager.h
+++ b/src/kudu/util/maintenance_manager.h
@@ -36,6 +36,7 @@
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/condition_variable.h"
#include "kudu/util/locks.h"
+#include "kudu/util/maintenance_manager_metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/mutex.h"
#include "kudu/util/random.h"
@@ -46,10 +47,10 @@ class Histogram;
class MaintenanceManager;
class MaintenanceManagerStatusPB;
class MaintenanceManagerStatusPB_OpInstancePB;
+class MetricEntity;
class Thread;
class ThreadPool;
-template<class T>
-class AtomicGauge;
+template <typename T> class AtomicGauge;
class MaintenanceOpStats {
public:
@@ -303,7 +304,9 @@ class MaintenanceManager : public
std::enable_shared_from_this<MaintenanceManage
uint32_t history_size;
};
- MaintenanceManager(const Options& options, std::string server_uuid);
+ MaintenanceManager(const Options& options,
+ std::string server_uuid,
+ const scoped_refptr<MetricEntity>& metric_entity);
~MaintenanceManager();
// Start running the maintenance manager.
@@ -433,6 +436,9 @@ class MaintenanceManager : public
std::enable_shared_from_this<MaintenanceManage
// Protected by running_instances_lock_;
std::unordered_map<int64_t, OpInstance*> running_instances_;
+ // MM-specific metrics.
+ MaintenanceManagerMetrics metrics_;
+
DISALLOW_COPY_AND_ASSIGN(MaintenanceManager);
};
diff --git a/src/kudu/util/maintenance_manager_metrics.cc
b/src/kudu/util/maintenance_manager_metrics.cc
new file mode 100644
index 0000000..6447489
--- /dev/null
+++ b/src/kudu/util/maintenance_manager_metrics.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/util/maintenance_manager_metrics.h"
+
+#include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
+
+// Prepare() for a maintenance operation might fail only due to really rare
+// circumstances. For example, if an operation collides with another running
+// instance of the same operation.
+METRIC_DEFINE_counter(server, maintenance_op_prepare_failed,
+ "Number Of Operations With Failed Prepare()",
+ kudu::MetricUnit::kOperations,
+ "Number of times when calling Prepare() on a maintenance
"
+ "operation failed",
+ kudu::MetricLevel::kWarn);
+
+METRIC_DEFINE_histogram(server, maintenance_op_find_best_candidate_duration,
+ "Time Taken To Find Best Maintenance Operation",
+ kudu::MetricUnit::kMicroseconds,
+ "Time spent choosing a maintenance operation with "
+ "highest scores among available candidates",
+ kudu::MetricLevel::kInfo,
+ 60000000LU, 2);
+
+namespace kudu {
+
+MaintenanceManagerMetrics::MaintenanceManagerMetrics(
+ const scoped_refptr<MetricEntity>& metric_entity)
+ : op_find_duration(
+
METRIC_maintenance_op_find_best_candidate_duration.Instantiate(metric_entity)),
+
op_prepare_failed(METRIC_maintenance_op_prepare_failed.Instantiate(metric_entity))
{
+}
+
+void MaintenanceManagerMetrics::SubmitOpFindDuration(const MonoDelta&
duration) {
+ op_find_duration->Increment(duration.ToMicroseconds());
+}
+
+void MaintenanceManagerMetrics::SubmitOpPrepareFailed() {
+ op_prepare_failed->Increment();
+}
+
+} // namespace kudu
diff --git a/src/kudu/util/maintenance_manager_metrics.h
b/src/kudu/util/maintenance_manager_metrics.h
new file mode 100644
index 0000000..fbd9bcf
--- /dev/null
+++ b/src/kudu/util/maintenance_manager_metrics.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "kudu/gutil/ref_counted.h"
+#include "kudu/util/metrics.h"
+
+namespace kudu {
+class MonoDelta;
+
+// An utility class to help with keeping track of the
MaintenanceManager-related
+// metrics.
+struct MaintenanceManagerMetrics {
+ explicit MaintenanceManagerMetrics(const scoped_refptr<MetricEntity>&
entity);
+
+ // Submit stats on the time spent choosing the best available maintenance
+ // operation among available candidates. The 'duration' parameter reflects
+ // the amount of time spent on finding the best candidate.
+ void SubmitOpFindDuration(const MonoDelta& duration);
+
+ // Increment the 'op_prepare_failed' counter.
+ void SubmitOpPrepareFailed();
+
+ // Time spent on choosing the "best" maintenance operation among candidates.
+ scoped_refptr<Histogram> op_find_duration;
+
+ // Number of times when running Prepare() failed on a maintenance operation.
+ scoped_refptr<Counter> op_prepare_failed;
+};
+
+} // namespace kudu