This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 2880d3a  [util] add a few new metrics in MaintenanceManager
2880d3a is described below

commit 2880d3a0dc0fc07d20fab79f1d0df3a6a622e398
Author: Alexey Serbin <[email protected]>
AuthorDate: Sun Jan 10 19:57:34 2021 -0800

    [util] add a few new metrics in MaintenanceManager
    
    This patch adds a couple of metrics for MaintenanceManager to track the
    duration of choosing the best candidate among available maintenance
    operations and number of times the Prepare() method for a maintenance
    operation failed:
      * maintenance_op_find_best_candidate_duration
      * maintenance_op_prepare_failed
    
    In addition, it adds SCOPED_LOG_SLOW_EXECUTION with the threshold of
    10 seconds into the MaintenanceManager::FindBestOp() method.
    
    At this point, I manually verified that those metrics are present and
    show relevant information.  I'm planning to add an automated test
    to cover the behavior of these new metrics in [1] to have less conflicts
    with the mentioned patch.
    
    The motivation for this change is a finding that FindBestOp()'s
    computational complexity is O(n^2) of the number of replicas per tablet
    server (each tablet replica registers about 8 maintenance operations).
    Also, BudgetedCompactionPolicy::RunApproximation()'s computational
    complexity is O(n^2) of the number of rowset in max and min keys.
    In the wild, there was an instance of a Kudu cluster with high data
    ingest ratio with the following stack showing in every snapshot in the
    diagnostic logs for many hours in a row:
    
         0xa11735 kudu::tablet::BudgetedCompactionPolicy::RunApproximation()
         0xa129c9 kudu::tablet::BudgetedCompactionPolicy::PickRowSets()
         0x9c8d80 kudu::tablet::Tablet::UpdateCompactionStats()
         0x9ec848 kudu::tablet::CompactRowSetsOp::UpdateStats()
        0x1b3de5c kudu::MaintenanceManager::FindBestOp()
        0x1b3f3c5 kudu::MaintenanceManager::RunSchedulerThread()
        0x1b86014 kudu::Thread::SuperviseThread()
    
    [1] https://gerrit.cloudera.org/#/c/16937/
    
    Change-Id: If5420afd605f9bd22207af142b49e73336907486
    Reviewed-on: http://gerrit.cloudera.org:8080/16938
    Reviewed-by: Andrew Wong <[email protected]>
    Tested-by: Kudu Jenkins
---
 src/kudu/master/master.cc                    |  4 +-
 src/kudu/tserver/tablet_server.cc            |  2 +-
 src/kudu/util/CMakeLists.txt                 |  1 +
 src/kudu/util/maintenance_manager-test.cc    | 12 +++++-
 src/kudu/util/maintenance_manager.cc         | 16 ++++++--
 src/kudu/util/maintenance_manager.h          | 12 ++++--
 src/kudu/util/maintenance_manager_metrics.cc | 58 ++++++++++++++++++++++++++++
 src/kudu/util/maintenance_manager_metrics.h  | 46 ++++++++++++++++++++++
 8 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc
index 4f693d9..45a8837 100644
--- a/src/kudu/master/master.cc
+++ b/src/kudu/master/master.cc
@@ -200,7 +200,9 @@ Status Master::Init() {
   }
 
   maintenance_manager_.reset(new MaintenanceManager(
-      MaintenanceManager::kDefaultOptions, fs_manager_->uuid()));
+      MaintenanceManager::kDefaultOptions,
+      fs_manager_->uuid(),
+      metric_entity()));
 
   // The certificate authority object is initialized upon loading
   // CA private key and certificate from the system table when the server
diff --git a/src/kudu/tserver/tablet_server.cc 
b/src/kudu/tserver/tablet_server.cc
index c8b87f3..6aa59ff 100644
--- a/src/kudu/tserver/tablet_server.cc
+++ b/src/kudu/tserver/tablet_server.cc
@@ -100,7 +100,7 @@ Status TabletServer::Init() {
   }
 
   maintenance_manager_ = std::make_shared<MaintenanceManager>(
-      MaintenanceManager::kDefaultOptions, fs_manager_->uuid());
+      MaintenanceManager::kDefaultOptions, fs_manager_->uuid(), 
metric_entity());
 
   heartbeater_.reset(new Heartbeater(std::move(master_addrs), this));
 
diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt
index b340f04..fc5955b 100644
--- a/src/kudu/util/CMakeLists.txt
+++ b/src/kudu/util/CMakeLists.txt
@@ -197,6 +197,7 @@ set(UTIL_SRCS
   locks.cc
   logging.cc
   maintenance_manager.cc
+  maintenance_manager_metrics.cc
   malloc.cc
   memcmpable_varint.cc
   memory/arena.cc
diff --git a/src/kudu/util/maintenance_manager-test.cc 
b/src/kudu/util/maintenance_manager-test.cc
index e43e60d..99a8b39 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -75,6 +75,11 @@ static const char kFakeUuid[] = "12345";
 
 class MaintenanceManagerTest : public KuduTest {
  public:
+  MaintenanceManagerTest()
+      : metric_entity_(METRIC_ENTITY_server.Instantiate(
+                       &metric_registry_, "test_entity")) {
+  }
+
   void SetUp() override {
     StartManager(2);
   }
@@ -88,9 +93,9 @@ class MaintenanceManagerTest : public KuduTest {
     options.num_threads = num_threads;
     options.polling_interval_ms = 1;
     options.history_size = kHistorySize;
-    manager_.reset(new MaintenanceManager(options, kFakeUuid));
+    manager_.reset(new MaintenanceManager(options, kFakeUuid, metric_entity_));
     manager_->set_memory_pressure_func_for_tests(
-        [&](double* consumption) {
+        [&](double* /* consumption */) {
           return indicate_memory_pressure_.load();
         });
     ASSERT_OK(manager_->Start());
@@ -101,6 +106,9 @@ class MaintenanceManagerTest : public KuduTest {
   }
 
  protected:
+  MetricRegistry metric_registry_;
+  scoped_refptr<MetricEntity> metric_entity_;
+
   shared_ptr<MaintenanceManager> manager_;
   std::atomic<bool> indicate_memory_pressure_ { false };
 };
diff --git a/src/kudu/util/maintenance_manager.cc 
b/src/kudu/util/maintenance_manager.cc
index 75ffd8b..64a361a 100644
--- a/src/kudu/util/maintenance_manager.cc
+++ b/src/kudu/util/maintenance_manager.cc
@@ -171,8 +171,10 @@ const MaintenanceManager::Options 
MaintenanceManager::kDefaultOptions = {
   .history_size = 0,
 };
 
-MaintenanceManager::MaintenanceManager(const Options& options,
-                                       string server_uuid)
+MaintenanceManager::MaintenanceManager(
+    const Options& options,
+    string server_uuid,
+    const scoped_refptr<MetricEntity>& metric_entity)
     : server_uuid_(std::move(server_uuid)),
       num_threads_(options.num_threads > 0
                    ? options.num_threads
@@ -186,7 +188,8 @@ MaintenanceManager::MaintenanceManager(const Options& 
options,
       running_ops_(0),
       completed_ops_count_(0),
       rand_(GetRandomSeed32()),
-      memory_pressure_func_(&process_memory::UnderMemoryPressure) {
+      memory_pressure_func_(&process_memory::UnderMemoryPressure),
+      metrics_(CHECK_NOTNULL(metric_entity)) {
   CHECK_OK(ThreadPoolBuilder("MaintenanceMgr")
                .set_min_threads(num_threads_)
                .set_max_threads(num_threads_)
@@ -361,6 +364,7 @@ void MaintenanceManager::RunSchedulerThread() {
     if (!op->Prepare()) {
       LOG_WITH_PREFIX(INFO) << "Prepare failed for " << op->name()
                             << ". Re-running scheduler.";
+      metrics_.SubmitOpPrepareFailed();
       std::lock_guard<Mutex> guard(running_instances_lock_);
       DecreaseOpCountAndNotifyWaiters(op);
       continue;
@@ -392,12 +396,18 @@ void MaintenanceManager::RunSchedulerThread() {
 // ops that free memory, then ops that free data disk space, then ops that
 // improve performance.
 pair<MaintenanceOp*, string> MaintenanceManager::FindBestOp() {
+  SCOPED_LOG_SLOW_EXECUTION(WARNING, 10000, "finding best maintenance 
operation");
   TRACE_EVENT0("maintenance", "MaintenanceManager::FindBestOp");
 
   if (!HasFreeThreads()) {
     return {nullptr, "no free threads"};
   }
 
+  const auto start_time = MonoTime::Now();
+  SCOPED_CLEANUP({
+    metrics_.SubmitOpFindDuration(MonoTime::Now() - start_time);
+  });
+
   int64_t low_io_most_logs_retained_bytes = 0;
   MaintenanceOp* low_io_most_logs_retained_bytes_op = nullptr;
 
diff --git a/src/kudu/util/maintenance_manager.h 
b/src/kudu/util/maintenance_manager.h
index 760eb8e..f14c498 100644
--- a/src/kudu/util/maintenance_manager.h
+++ b/src/kudu/util/maintenance_manager.h
@@ -36,6 +36,7 @@
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/util/condition_variable.h"
 #include "kudu/util/locks.h"
+#include "kudu/util/maintenance_manager_metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/mutex.h"
 #include "kudu/util/random.h"
@@ -46,10 +47,10 @@ class Histogram;
 class MaintenanceManager;
 class MaintenanceManagerStatusPB;
 class MaintenanceManagerStatusPB_OpInstancePB;
+class MetricEntity;
 class Thread;
 class ThreadPool;
-template<class T>
-class AtomicGauge;
+template <typename T> class AtomicGauge;
 
 class MaintenanceOpStats {
  public:
@@ -303,7 +304,9 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
     uint32_t history_size;
   };
 
-  MaintenanceManager(const Options& options, std::string server_uuid);
+  MaintenanceManager(const Options& options,
+                     std::string server_uuid,
+                     const scoped_refptr<MetricEntity>& metric_entity);
   ~MaintenanceManager();
 
   // Start running the maintenance manager.
@@ -433,6 +436,9 @@ class MaintenanceManager : public 
std::enable_shared_from_this<MaintenanceManage
   // Protected by running_instances_lock_;
   std::unordered_map<int64_t, OpInstance*> running_instances_;
 
+  // MM-specific metrics.
+  MaintenanceManagerMetrics metrics_;
+
   DISALLOW_COPY_AND_ASSIGN(MaintenanceManager);
 };
 
diff --git a/src/kudu/util/maintenance_manager_metrics.cc 
b/src/kudu/util/maintenance_manager_metrics.cc
new file mode 100644
index 0000000..6447489
--- /dev/null
+++ b/src/kudu/util/maintenance_manager_metrics.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/util/maintenance_manager_metrics.h"
+
+#include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
+
+// Prepare() for a maintenance operation might fail only due to really rare
+// circumstances. For example, if an operation collides with another running
+// instance of the same operation.
+METRIC_DEFINE_counter(server, maintenance_op_prepare_failed,
+                      "Number Of Operations With Failed Prepare()",
+                      kudu::MetricUnit::kOperations,
+                      "Number of times when calling Prepare() on a maintenance 
"
+                      "operation failed",
+                      kudu::MetricLevel::kWarn);
+
+METRIC_DEFINE_histogram(server, maintenance_op_find_best_candidate_duration,
+                        "Time Taken To Find Best Maintenance Operation",
+                        kudu::MetricUnit::kMicroseconds,
+                        "Time spent choosing a maintenance operation with "
+                        "highest scores among available candidates",
+                        kudu::MetricLevel::kInfo,
+                        60000000LU, 2);
+
+namespace kudu {
+
+MaintenanceManagerMetrics::MaintenanceManagerMetrics(
+    const scoped_refptr<MetricEntity>& metric_entity)
+    : op_find_duration(
+          
METRIC_maintenance_op_find_best_candidate_duration.Instantiate(metric_entity)),
+      
op_prepare_failed(METRIC_maintenance_op_prepare_failed.Instantiate(metric_entity))
 {
+}
+
+void MaintenanceManagerMetrics::SubmitOpFindDuration(const MonoDelta& 
duration) {
+  op_find_duration->Increment(duration.ToMicroseconds());
+}
+
+void MaintenanceManagerMetrics::SubmitOpPrepareFailed() {
+  op_prepare_failed->Increment();
+}
+
+} // namespace kudu
diff --git a/src/kudu/util/maintenance_manager_metrics.h 
b/src/kudu/util/maintenance_manager_metrics.h
new file mode 100644
index 0000000..fbd9bcf
--- /dev/null
+++ b/src/kudu/util/maintenance_manager_metrics.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "kudu/gutil/ref_counted.h"
+#include "kudu/util/metrics.h"
+
+namespace kudu {
+class MonoDelta;
+
+// An utility class to help with keeping track of the 
MaintenanceManager-related
+// metrics.
+struct MaintenanceManagerMetrics {
+  explicit MaintenanceManagerMetrics(const scoped_refptr<MetricEntity>& 
entity);
+
+  // Submit stats on the time spent choosing the best available maintenance
+  // operation among available candidates. The 'duration' parameter reflects
+  // the amount of time spent on finding the best candidate.
+  void SubmitOpFindDuration(const MonoDelta& duration);
+
+  // Increment the 'op_prepare_failed' counter.
+  void SubmitOpPrepareFailed();
+
+  // Time spent on choosing the "best" maintenance operation among candidates.
+  scoped_refptr<Histogram> op_find_duration;
+
+  // Number of times when running Prepare() failed on a maintenance operation.
+  scoped_refptr<Counter> op_prepare_failed;
+};
+
+} // namespace kudu

Reply via email to