This is an automated email from the ASF dual-hosted git repository. laiyingchun pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 1b99da532f52d143c46440c3903785d642fb45a3 Author: kedeng <[email protected]> AuthorDate: Wed Jun 12 15:16:56 2024 +0800 [metrics] Add metrics for tablet replication time Add a tablet-level histogram metric, replication_duration, that tracks how long replication between replicas takes. To verify the new logic, a test performs a write operation and checks that the metric records exactly one sample. The new metric will aid in historical issue tracking and analysis, and makes it easier to configure monitoring alarms. Change-Id: I4ea89c3954ceecabdb2998d2a8945f12a2fb8aa9 Reviewed-on: http://gerrit.cloudera.org:8080/21507 Reviewed-by: Zoltan Chovan <[email protected]> Reviewed-by: Yingchun Lai <[email protected]> Tested-by: Yingchun Lai <[email protected]> --- src/kudu/tablet/ops/op_driver.cc | 5 ++++- src/kudu/tablet/tablet_metrics.cc | 8 ++++++++ src/kudu/tablet/tablet_metrics.h | 1 + src/kudu/tablet/tablet_replica-test.cc | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/kudu/tablet/ops/op_driver.cc b/src/kudu/tablet/ops/op_driver.cc index 2f4b372d4..6fa6f9ded 100644 --- a/src/kudu/tablet/ops/op_driver.cc +++ b/src/kudu/tablet/ops/op_driver.cc @@ -150,7 +150,7 @@ Status OpDriver::Init(unique_ptr<Op> op, // Start the replica op in the thread that is updating consensus, for // non-leader ops. // Replica ops were already assigned a timestamp so we don't need to - // acquire locks before calling Start(). Starting the the op here gives a + // acquire locks before calling Start(). Starting the op here gives a // strong guarantee to consensus that the op is on mvcc when it moves // "safe" time so that we don't risk marking a timestamp "safe" before all // ops before it are in-flight are on mvcc. 
@@ -440,6 +440,9 @@ void OpDriver::ReplicationFinished(const Status& status) { replication_duration = replication_finished_time - replication_start_time_; } + if (auto* metrics = op_->state()->tablet_replica()->tablet()->metrics()) { + metrics->replication_duration->Increment(replication_duration.ToMicroseconds()); + } TRACE_COUNTER_INCREMENT("replication_time_us", replication_duration.ToMicroseconds()); TRACE("REPLICATION: finished"); diff --git a/src/kudu/tablet/tablet_metrics.cc b/src/kudu/tablet/tablet_metrics.cc index 804aa48e7..783618af0 100644 --- a/src/kudu/tablet/tablet_metrics.cc +++ b/src/kudu/tablet/tablet_metrics.cc @@ -252,6 +252,13 @@ METRIC_DEFINE_histogram(tablet, alter_schema_duration, kudu::MetricLevel::kDebug, 60000000LU, 2); +METRIC_DEFINE_histogram(tablet, replication_duration, + "Replica Replication Duration", + kudu::MetricUnit::kMicroseconds, + "Duration of replication between replicas on the leader.", + kudu::MetricLevel::kDebug, + 60000000LU, 2); + METRIC_DEFINE_histogram(tablet, write_op_duration_client_propagated_consistency, "Write Op Duration with Propagated Consistency", kudu::MetricUnit::kMicroseconds, @@ -473,6 +480,7 @@ TabletMetrics::TabletMetrics(const scoped_refptr<MetricEntity>& entity) MINIT(write_op_duration_client_propagated_consistency), MINIT(write_op_duration_commit_wait_consistency), MINIT(alter_schema_duration), + MINIT(replication_duration), GINIT(flush_dms_running), GINIT(flush_mrs_running), GINIT(compact_rs_running), diff --git a/src/kudu/tablet/tablet_metrics.h b/src/kudu/tablet/tablet_metrics.h index 5bd9620ce..5c75ecb3a 100644 --- a/src/kudu/tablet/tablet_metrics.h +++ b/src/kudu/tablet/tablet_metrics.h @@ -90,6 +90,7 @@ struct TabletMetrics { scoped_refptr<Histogram> write_op_duration_client_propagated_consistency; scoped_refptr<Histogram> write_op_duration_commit_wait_consistency; scoped_refptr<Histogram> alter_schema_duration; + scoped_refptr<Histogram> replication_duration; scoped_refptr<AtomicGauge<uint32_t> 
> flush_dms_running; scoped_refptr<AtomicGauge<uint32_t> > flush_mrs_running; diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc index c9885a645..453309330 100644 --- a/src/kudu/tablet/tablet_replica-test.cc +++ b/src/kudu/tablet/tablet_replica-test.cc @@ -771,5 +771,23 @@ TEST_F(TabletReplicaTest, RowLocksLongWaitAndLogging) { t1.join(); } +// Test the replication duration metric works. +TEST_F(TabletReplicaTest, TestReplicationDurationMetric) { + ConsensusBootstrapInfo info; + ASSERT_OK(StartReplicaAndWaitUntilLeader(info)); + + // The metric should be zero at the beginning. + ASSERT_EQ(0, tablet_replica_->tablet()->metrics()->replication_duration->TotalCount()); + + auto req = std::make_unique<WriteRequestPB>(); + ASSERT_OK(GenerateSequentialInsertRequest(GetTestSchema(), req.get())); + ASSERT_OK(ExecuteWrite(tablet_replica_.get(), *req)); + + // The metric should be non-zero after the write completes. + ASSERT_EVENTUALLY([&]{ + ASSERT_EQ(1, tablet_replica_->tablet()->metrics()->replication_duration->TotalCount()); + }); +} + } // namespace tablet } // namespace kudu
