maintenance_manager: log the reason for scheduling each operation This makes it easier to troubleshoot when the maintenance manager appears to be scheduling the "wrong" operation.
Example output from a long run of full_stack_insert_scan-test: ...: Scheduling FlushMRSOp(93ded54b4dfb4e1586ff7fe700184f53): under memory pressure (60.30% used, can flush 641875735 bytes) ...: Scheduling LogGCOp(93ded54b4dfb4e1586ff7fe700184f53): free 386079935 bytes of WAL ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 29857788 bytes on disk ...: Scheduling FlushMRSOp(93ded54b4dfb4e1586ff7fe700184f53): under memory pressure (60.01% used, can flush 637714199 bytes) ...: Scheduling LogGCOp(93ded54b4dfb4e1586ff7fe700184f53): free 394543122 bytes of WAL ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.281697 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.280992 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.280256 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.060532 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.060298 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 56855045 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.054961 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 7202893 bytes on disk ...: Scheduling FlushMRSOp(93ded54b4dfb4e1586ff7fe700184f53): under memory pressure (60.25% used, can flush 633552663 bytes) ...: Scheduling LogGCOp(93ded54b4dfb4e1586ff7fe700184f53): free 394836440 bytes of WAL ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.192819 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.184820 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.184674 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 70881575 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.127476 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 18119656 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.127334 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.111677 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 31714242 bytes on disk ...: Scheduling FlushMRSOp(93ded54b4dfb4e1586ff7fe700184f53): under memory pressure (60.06% used, can flush 624189207 bytes) ...: Scheduling LogGCOp(93ded54b4dfb4e1586ff7fe700184f53): free 377818508 bytes of WAL ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 29069301 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.144540 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.138843 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 36867458 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.138827 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.122173 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 31717417 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.121637 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.121104 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.088980 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.087296 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 54371475 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.055906 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 9063001 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.031908 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 6840843 bytes on disk ...: Scheduling FlushMRSOp(93ded54b4dfb4e1586ff7fe700184f53): under memory pressure (60.08% used, can flush 614825751 bytes) ...: Scheduling LogGCOp(93ded54b4dfb4e1586ff7fe700184f53): free 377913596 bytes of WAL ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 28612520 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.113092 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.110844 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 36863166 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.109090 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 18129573 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.108572 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.108551 ...: Scheduling UndoDeltaBlockGCOp(93ded54b4dfb4e1586ff7fe700184f53): 36251849 bytes on disk ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.105718 ...: Scheduling CompactRowSetsOp(93ded54b4dfb4e1586ff7fe700184f53): perf score=0.065384 Change-Id: I4dcdb863a7a0b0fc2a72757801d5c057fa725c34 Reviewed-on: http://gerrit.cloudera.org:8080/9172 Tested-by: Kudu Jenkins Reviewed-by: Will Berkeley <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/29e795d5 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/29e795d5 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/29e795d5 Branch: refs/heads/master Commit: 29e795d58fc1a59102f358b4edb4faa39aa5d02b Parents: f81690d Author: Todd Lipcon <[email protected]> Authored: Wed Jan 31 14:45:31 2018 -0800 Committer: Todd Lipcon <[email protected]> Committed: Mon Mar 5 18:22:07 2018 +0000 ---------------------------------------------------------------------- src/kudu/util/debug/trace_logging.h | 26 +++++++-- src/kudu/util/maintenance_manager-test.cc | 17 ++++-- src/kudu/util/maintenance_manager.cc | 77 +++++++++++++------------- src/kudu/util/maintenance_manager.h | 7 ++- 4 files changed, 76 insertions(+), 51 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/29e795d5/src/kudu/util/debug/trace_logging.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/debug/trace_logging.h b/src/kudu/util/debug/trace_logging.h index c497562..1a3b39e 100644 --- a/src/kudu/util/debug/trace_logging.h +++ b/src/kudu/util/debug/trace_logging.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. // -// This header defines the following macro: +// This header defines the following macros: // // VLOG_AND_TRACE(category, vlevel) // @@ -39,6 +39,12 @@ // either trace recording or VLOG(n) is enabled. In the case that both are enabled, // the arguments are only evaluated once. // +// +// LOG_AND_TRACE(category, severity) +// +// Same as the above, but always logs at the given severity level in addition +// to writing to the trace buffer. + #ifndef KUDU_DEBUG_TRACE_LOGGING_H #define KUDU_DEBUG_TRACE_LOGGING_H @@ -56,23 +62,31 @@ // of the ternary expression and the log stream on the other. This technique is // cribbed from glog/logging.h. #define VLOG_AND_TRACE_INTERNAL(category, vlevel) \ - kudu::debug::TraceVLog(__FILE__, __LINE__, category, VLOG_IS_ON(vlevel)).stream() + kudu::debug::TraceGLog(__FILE__, __LINE__, category, google::GLOG_INFO, \ + /* send_to_log= */VLOG_IS_ON(vlevel)).stream() + #define VLOG_AND_TRACE(category, vlevel) \ !( { \ bool enabled; \ TRACE_EVENT_CATEGORY_GROUP_ENABLED(category, &enabled); \ enabled || VLOG_IS_ON(vlevel); \ } ) ? static_cast<void>(0) : \ - google::LogMessageVoidify() & VLOG_AND_TRACE_INTERNAL(category, vlevel) + google::LogMessageVoidify() & VLOG_AND_TRACE_INTERNAL(category, vlevel) // NOLINT(*) + + +#define LOG_AND_TRACE(category, severity) \ + kudu::debug::TraceGLog(__FILE__, __LINE__, category, \ + google::GLOG_ ## severity, /* send_to_log= */true).stream() namespace kudu { namespace debug { -class TraceVLog { +class TraceGLog { public: - TraceVLog(const char* file, int line, const char* category, bool do_vlog) + TraceGLog(const char* file, int line, const char* category, + google::LogSeverity severity, bool send_to_log) : sink_(category), - google_msg_(file, line, google::GLOG_INFO, &sink_, do_vlog) { + google_msg_(file, line, severity, &sink_, send_to_log) { } std::ostream& stream() { http://git-wip-us.apache.org/repos/asf/kudu/blob/29e795d5/src/kudu/util/maintenance_manager-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc index 7cc6398..1889139 100644 --- a/src/kudu/util/maintenance_manager-test.cc +++ b/src/kudu/util/maintenance_manager-test.cc @@ -21,6 +21,7 @@ #include <mutex> #include <ostream> #include <string> +#include <utility> #include <vector> #include <boost/bind.hpp> // IWYU pragma: keep @@ -284,22 +285,30 @@ TEST_F(MaintenanceManagerTest, TestLogRetentionPrioritization) { manager_->RegisterOp(&op3); // We want to do the low IO op first since it clears up some log retention. - ASSERT_EQ(&op1, manager_->FindBestOp()); + auto op_and_why = manager_->FindBestOp(); + ASSERT_EQ(&op1, op_and_why.first); + EXPECT_EQ(op_and_why.second, "free 104857600 bytes of WAL"); manager_->UnregisterOp(&op1); // Low IO is taken care of, now we find the op that clears the most log retention and ram. // However, with the default settings, we won't bother running any of these operations // which only retain 100MB of logs. - ASSERT_EQ(nullptr, manager_->FindBestOp()); + op_and_why = manager_->FindBestOp(); + ASSERT_EQ(nullptr, op_and_why.first); + EXPECT_EQ(op_and_why.second, "no ops with positive improvement"); // If we change the target WAL size, we will select these ops. FLAGS_log_target_replay_size_mb = 50; - ASSERT_EQ(&op3, manager_->FindBestOp()); + op_and_why = manager_->FindBestOp(); + ASSERT_EQ(&op3, op_and_why.first); + EXPECT_EQ(op_and_why.second, "104857600 bytes log retention"); manager_->UnregisterOp(&op3); - ASSERT_EQ(&op2, manager_->FindBestOp()); + op_and_why = manager_->FindBestOp(); + ASSERT_EQ(&op2, op_and_why.first); + EXPECT_EQ(op_and_why.second, "104857600 bytes log retention"); manager_->UnregisterOp(&op2); } http://git-wip-us.apache.org/repos/asf/kudu/blob/29e795d5/src/kudu/util/maintenance_manager.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/maintenance_manager.cc b/src/kudu/util/maintenance_manager.cc index 50d792e..61b2d0b 100644 --- a/src/kudu/util/maintenance_manager.cc +++ b/src/kudu/util/maintenance_manager.cc @@ -17,12 +17,14 @@ #include "kudu/util/maintenance_manager.h" +#include <cinttypes> #include <cstddef> #include <cstdint> #include <memory> #include <mutex> #include <sstream> #include <string> +#include <type_traits> #include <utility> #include <boost/bind.hpp> @@ -47,7 +49,7 @@ #include "kudu/util/trace.h" using std::pair; -using std::shared_ptr; +using std::string; using strings::Substitute; DEFINE_int32(maintenance_manager_num_threads, 1, @@ -269,7 +271,10 @@ void MaintenanceManager::RunSchedulerThread() { } // Find the best op. - MaintenanceOp* op = FindBestOp(); + pair<MaintenanceOp*, string> op_and_note = FindBestOp(); + auto* op = op_and_note.first; + const auto& note = op_and_note.second; + // If we found no work to do, then we should sleep before trying again to schedule. // Otherwise, we can go right into trying to find the next op. prev_iter_found_no_work = (op == nullptr); @@ -294,9 +299,11 @@ void MaintenanceManager::RunSchedulerThread() { continue; } + LOG_AND_TRACE("maintenance", INFO) << LogPrefix() << "Scheduling " + << op->name() << ": " << note; // Run the maintenance operation. Status s = thread_pool_->SubmitFunc(boost::bind( - &MaintenanceManager::LaunchOp, this, op)); + &MaintenanceManager::LaunchOp, this, op)); CHECK(s.ok()); } } @@ -321,14 +328,12 @@ void MaintenanceManager::RunSchedulerThread() { // sliding priority between log retention and RAM usage. For example, is an Op that frees // 128MB of log retention and 12MB of RAM always better than an op that frees 12MB of log retention // and 128MB of RAM? Maybe a more holistic approach would be better. -MaintenanceOp* MaintenanceManager::FindBestOp() { +pair<MaintenanceOp*, string> MaintenanceManager::FindBestOp() { TRACE_EVENT0("maintenance", "MaintenanceManager::FindBestOp"); size_t free_threads = num_threads_ - running_ops_; if (free_threads == 0) { - VLOG_AND_TRACE("maintenance", 1) << LogPrefix() - << "There are no free threads, so we can't run anything."; - return nullptr; + return {nullptr, "no free threads"}; } int64_t low_io_most_logs_retained_bytes = 0; @@ -396,12 +401,8 @@ MaintenanceOp* MaintenanceManager::FindBestOp() { // Look at ops that we can run quickly that free up log retention. if (low_io_most_logs_retained_bytes_op) { if (low_io_most_logs_retained_bytes > 0) { - VLOG_AND_TRACE("maintenance", 1) << LogPrefix() - << "Performing " << low_io_most_logs_retained_bytes_op->name() << ", " - << "because it can free up more logs " - << "at " << low_io_most_logs_retained_bytes - << " bytes with a low IO cost"; - return low_io_most_logs_retained_bytes_op; + string notes = Substitute("free $0 bytes of WAL", low_io_most_logs_retained_bytes); + return {low_io_most_logs_retained_bytes_op, std::move(notes)}; } } @@ -410,25 +411,22 @@ MaintenanceOp* MaintenanceManager::FindBestOp() { double capacity_pct; if (memory_pressure_func_(&capacity_pct)) { if (!most_mem_anchored_op) { - std::string msg = StringPrintf("we have exceeded our soft memory limit " - "(current capacity is %.2f%%). However, there are no ops currently " + std::string msg = StringPrintf("System under memory pressure " + "(%.2f%% of limit used). However, there are no ops currently " "runnable which would free memory.", capacity_pct); LOG_WITH_PREFIX(INFO) << msg; - return nullptr; + return {nullptr, msg}; } - VLOG_AND_TRACE("maintenance", 1) << LogPrefix() << "We have exceeded our soft memory limit " - << "(current capacity is " << capacity_pct << "%). Running the op " - << "which anchors the most memory: " << most_mem_anchored_op->name(); - return most_mem_anchored_op; + string note = StringPrintf("under memory pressure (%.2f%% used, " + "can flush %" PRIu64 " bytes)", + capacity_pct, most_mem_anchored); + return {most_mem_anchored_op, std::move(note)}; } if (most_logs_retained_bytes_op && most_logs_retained_bytes / 1024 / 1024 >= FLAGS_log_target_replay_size_mb) { - VLOG_AND_TRACE("maintenance", 1) << LogPrefix() - << "Performing " << most_logs_retained_bytes_op->name() << ", " - << "because it can free up more logs (" << most_logs_retained_bytes - << " bytes)"; - return most_logs_retained_bytes_op; + string note = Substitute("$0 bytes log retention", most_logs_retained_bytes); + return {most_logs_retained_bytes_op, std::move(note)}; } // Look at ops that we can run quickly that free up data on disk. @@ -436,23 +434,17 @@ MaintenanceOp* MaintenanceManager::FindBestOp() { most_data_retained_bytes > FLAGS_data_gc_min_size_mb * 1024 * 1024) { if (!best_perf_improvement_op || best_perf_improvement <= 0 || rand_.NextDoubleFraction() <= FLAGS_data_gc_prioritization_prob) { - VLOG_AND_TRACE("maintenance", 1) << LogPrefix() - << "Performing " << most_data_retained_bytes_op->name() << ", " - << "because it can free up more data " - << "at " << most_data_retained_bytes << " bytes"; - return most_data_retained_bytes_op; + string note = Substitute("$0 bytes on disk", most_data_retained_bytes); + return {most_data_retained_bytes_op, std::move(note)}; } VLOG(1) << "Skipping data GC due to prioritizing perf improvement"; } if (best_perf_improvement_op && best_perf_improvement > 0) { - VLOG_AND_TRACE("maintenance", 1) << LogPrefix() << "Performing " - << best_perf_improvement_op->name() << ", " - << "because it had the best perf_improvement score, " - << "at " << best_perf_improvement; - return best_perf_improvement_op; + string note = StringPrintf("perf score=%.6f", best_perf_improvement); + return {best_perf_improvement_op, std::move(note)}; } - return nullptr; + return {nullptr, "no ops with positive improvement"}; } void MaintenanceManager::LaunchOp(MaintenanceOp* op) { @@ -488,19 +480,26 @@ void MaintenanceManager::LaunchOp(MaintenanceOp* op) { }); scoped_refptr<Trace> trace(new Trace); - LOG_TIMING(INFO, Substitute("running $0", op->name())) { + Stopwatch sw; + sw.start(); + { ADOPT_TRACE(trace.get()); TRACE_EVENT1("maintenance", "MaintenanceManager::LaunchOp", "name", op->name()); op->Perform(); + sw.stop(); } - LOG_WITH_PREFIX(INFO) << op->name() << " metrics: " << trace->MetricsAsJSON(); + LOG_WITH_PREFIX(INFO) << op->name() << " complete. " + << "Timing: " << sw.elapsed().ToString() + << " Metrics: " << trace->MetricsAsJSON(); } void MaintenanceManager::GetMaintenanceManagerStatusDump(MaintenanceManagerStatusPB* out_pb) { DCHECK(out_pb != nullptr); std::lock_guard<Mutex> guard(lock_); - MaintenanceOp* best_op = FindBestOp(); + pair<MaintenanceOp*, string> best_op_and_why = FindBestOp(); + auto* best_op = best_op_and_why.first; + for (MaintenanceManager::OpMapTy::value_type& val : ops_) { MaintenanceManagerStatusPB_MaintenanceOpPB* op_pb = out_pb->add_registered_operations(); MaintenanceOp* op(val.first); http://git-wip-us.apache.org/repos/asf/kudu/blob/29e795d5/src/kudu/util/maintenance_manager.h ---------------------------------------------------------------------- diff --git a/src/kudu/util/maintenance_manager.h b/src/kudu/util/maintenance_manager.h index 888dc39..dc36f67 100644 --- a/src/kudu/util/maintenance_manager.h +++ b/src/kudu/util/maintenance_manager.h @@ -308,8 +308,11 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage void RunSchedulerThread(); - // find the best op, or null if there is nothing we want to run - MaintenanceOp* FindBestOp(); + // Find the best op, or null if there is nothing we want to run. + // + // Returns the op, as well as a string explanation of why that op was chosen, + // suitable for logging. + std::pair<MaintenanceOp*, std::string> FindBestOp(); void LaunchOp(MaintenanceOp* op);
