This is an automated email from the ASF dual-hosted git repository. laiyingchun pushed a commit to tag kudu-1.12.0-mdh1.0.0-4c2c075-centos-release in repository https://gitbox.apache.org/repos/asf/kudu.git
commit dac73ac23d088058c11262358c7a5a3e65150dec Author: zhangyifan27 <[email protected]> AuthorDate: Mon Aug 10 19:39:21 2020 +0800 KUDU-3180: prioritize larger mem-stores in time-based flushing The current time-based flush policy will always pick a mem-store that hasn't been flushed in a long time instead of a mem-store anchoring more memory, this may lead to: - more memory used by mem-stores. - more small rowsets on disk so we need to do more compaction. This patch improves the current flush policy by considering both mem-stores' size and time since last flush. When a mem-store becomes large or old enough, it will be more likely to flush, then we can avoid anchoring large (but below the threshold) mem-stores or WALs for too long. Change-Id: I0a826643709a4990e40b0a49f89f4ea34f14163b Reviewed-on: http://gerrit.cloudera.org:8080/16319 Tested-by: Kudu Jenkins Reviewed-by: Andrew Wong <[email protected]> --- src/kudu/tablet/tablet_replica-test.cc | 22 ++++++++++++----- src/kudu/tablet/tablet_replica_mm_ops.cc | 42 ++++++++++++++++---------------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc index b4d1750..339f80a 100644 --- a/src/kudu/tablet/tablet_replica-test.cc +++ b/src/kudu/tablet/tablet_replica-test.cc @@ -673,16 +673,18 @@ TEST_F(TabletReplicaTest, TestFlushOpsPerfImprovements) { MaintenanceOpStats stats; - // Just on the threshold and not enough time has passed for a time-based flush. + // Just on the threshold and not enough time has passed for a time-based flush, + // we'll expect improvement equal to '1'. stats.set_ram_anchored(64 * 1024 * 1024); FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 1); - ASSERT_EQ(0.0, stats.perf_improvement()); + ASSERT_EQ(1.0, stats.perf_improvement()); stats.Clear(); - // Just on the threshold and enough time has passed, we'll have a low improvement. 
- stats.set_ram_anchored(64 * 1024 * 1024); + // Below the threshold and enough time has passed, we'll have a low improvement. + stats.set_ram_anchored(2 * 1024 * 1024); FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000); - ASSERT_GT(stats.perf_improvement(), 0.01); + ASSERT_LT(0.01, stats.perf_improvement()); + ASSERT_GT(0.1, stats.perf_improvement()); stats.Clear(); // Over the threshold, we expect improvement equal to the excess MB. @@ -692,11 +694,19 @@ TEST_F(TabletReplicaTest, TestFlushOpsPerfImprovements) { stats.Clear(); // Below the threshold but have been there a long time, closing in to 1.0. - stats.set_ram_anchored(30 * 1024 * 1024); + stats.set_ram_anchored(1); FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 60 * 50 * 1000); ASSERT_LT(0.7, stats.perf_improvement()); ASSERT_GT(1.0, stats.perf_improvement()); stats.Clear(); + + // Approaching threshold, enough time has passed but haven't been there a long time, + // closing in to 1.0. 
+ stats.set_ram_anchored(63 * 1024 * 1024); + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000); + ASSERT_LT(0.9, stats.perf_improvement()); + ASSERT_GT(1.0, stats.perf_improvement()); + stats.Clear(); } // Test that the schema of a tablet will be rolled forward upon replaying an diff --git a/src/kudu/tablet/tablet_replica_mm_ops.cc b/src/kudu/tablet/tablet_replica_mm_ops.cc index 61243a7..ea53453 100644 --- a/src/kudu/tablet/tablet_replica_mm_ops.cc +++ b/src/kudu/tablet/tablet_replica_mm_ops.cc @@ -17,11 +17,11 @@ #include "kudu/tablet/tablet_replica_mm_ops.h" +#include <algorithm> #include <map> #include <mutex> #include <ostream> #include <string> -#include <utility> #include <boost/optional/optional.hpp> #include <gflags/gflags.h> @@ -30,7 +30,6 @@ #include "kudu/common/common.pb.h" #include "kudu/gutil/macros.h" #include "kudu/gutil/port.h" -#include "kudu/gutil/strings/substitute.h" #include "kudu/tablet/tablet_metadata.h" #include "kudu/tablet/tablet_metrics.h" #include "kudu/util/flag_tags.h" @@ -63,18 +62,25 @@ TAG_FLAG(enable_log_gc, runtime); TAG_FLAG(enable_log_gc, unsafe); DEFINE_int32(flush_threshold_mb, 1024, - "Size at which MemRowSet flushes are triggered. " + "Size at which MRS/DMS flushes are triggered. 
" "A MRS can still flush below this threshold if it hasn't flushed in a while, " "or if the server-wide memory limit has been reached."); TAG_FLAG(flush_threshold_mb, experimental); TAG_FLAG(flush_threshold_mb, runtime); DEFINE_int32(flush_threshold_secs, 2 * 60, - "Number of seconds after which a non-empty MemRowSet will become flushable " + "Number of seconds after which a non-empty MRS/DMS will become flushable " "even if it is not large."); TAG_FLAG(flush_threshold_secs, experimental); TAG_FLAG(flush_threshold_secs, runtime); +DEFINE_int32(flush_upper_bound_ms, 60 * 60 * 1000, + "Number of milliseconds after which the time-based performance improvement " + "score of a non-empty MRS/DMS flush op will reach its maximum value. " + "The score may further increase as the MRS/DMS grows in size."); +TAG_FLAG(flush_upper_bound_ms, experimental); +TAG_FLAG(flush_upper_bound_ms, runtime); + DECLARE_bool(enable_workload_score_for_perf_improvement_ops); METRIC_DEFINE_gauge_uint32(tablet, log_gc_running, @@ -93,10 +99,6 @@ namespace kudu { namespace tablet { using std::map; -using strings::Substitute; - -// Upper bound for how long it takes to reach "full perf improvement" in time-based flushing. -const double kFlushUpperBoundMs = 60 * 60 * 1000; // // FlushOpPerfImprovementPolicy. @@ -106,27 +108,25 @@ void FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(MaintenanceOpStats double elapsed_ms) { double anchored_mb = static_cast<double>(stats->ram_anchored()) / (1024 * 1024); const double threshold_mb = FLAGS_flush_threshold_mb; - if (anchored_mb > threshold_mb) { + const double upper_bound_ms = FLAGS_flush_upper_bound_ms; + if (anchored_mb >= threshold_mb) { // If we're over the user-specified flush threshold, then consider the perf - // improvement to be 1 for every extra MB. 
This produces perf_improvement results - // which are much higher than most compactions would produce, and means that, when - // there is an MRS over threshold, a flush will almost always be selected instead of - // a compaction. That's not necessarily a good thing, but in the absence of better + // improvement to be 1 for every extra MB (at least 1). This produces perf_improvement + // results which are much higher than most compactions would produce, and means that, + // when there is an MRS over threshold, a flush will almost always be selected instead of + // a compaction. That's not necessarily a good thing, but in the absence of better // heuristics, it will do for now. double extra_mb = anchored_mb - threshold_mb; DCHECK_GE(extra_mb, 0); - stats->set_perf_improvement(extra_mb); + stats->set_perf_improvement(std::max(1.0, extra_mb)); } else if (elapsed_ms > FLAGS_flush_threshold_secs * 1000) { // Even if we aren't over the threshold, consider flushing if we haven't flushed // in a long time. But, don't give it a large perf_improvement score. We should // only do this if we really don't have much else to do, and if we've already waited a bit. - // The following will give an improvement that's between 0.0 and 1.0, gradually growing - // as 'elapsed_ms' approaches 'kFlushUpperBoundMs'. - double perf = elapsed_ms / kFlushUpperBoundMs; - if (perf > 1.0) { - perf = 1.0; - } - stats->set_perf_improvement(perf); + // The following will give an improvement that's between 0.0 and 1.0, gradually growing as + // 'elapsed_ms' approaches 'upper_bound_ms' or 'anchored_mb' approaches 'threshold_mb'. + double perf = std::max(elapsed_ms / upper_bound_ms, anchored_mb / threshold_mb); + stats->set_perf_improvement(std::min(1.0, perf)); } }
