This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 1b99da532f52d143c46440c3903785d642fb45a3
Author: kedeng <[email protected]>
AuthorDate: Wed Jun 12 15:16:56 2024 +0800

    [metrics] Add metrics for tablet replication time
    
    Add tablet-level metric to track the time cost of
    replication between replicas.
    
    To verify the correctness of the new logic, I
    constructed a synchronization scenario based on
    write operations.
    
    The addition of monitoring items will aid in
    historical issue tracking and analysis, as well
    as facilitate the configuration of monitoring alarms.
    
    Change-Id: I4ea89c3954ceecabdb2998d2a8945f12a2fb8aa9
    Reviewed-on: http://gerrit.cloudera.org:8080/21507
    Reviewed-by: Zoltan Chovan <[email protected]>
    Reviewed-by: Yingchun Lai <[email protected]>
    Tested-by: Yingchun Lai <[email protected]>
---
 src/kudu/tablet/ops/op_driver.cc       |  5 ++++-
 src/kudu/tablet/tablet_metrics.cc      |  8 ++++++++
 src/kudu/tablet/tablet_metrics.h       |  1 +
 src/kudu/tablet/tablet_replica-test.cc | 18 ++++++++++++++++++
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/kudu/tablet/ops/op_driver.cc b/src/kudu/tablet/ops/op_driver.cc
index 2f4b372d4..6fa6f9ded 100644
--- a/src/kudu/tablet/ops/op_driver.cc
+++ b/src/kudu/tablet/ops/op_driver.cc
@@ -150,7 +150,7 @@ Status OpDriver::Init(unique_ptr<Op> op,
     // Start the replica op in the thread that is updating consensus, for
     // non-leader ops.
     // Replica ops were already assigned a timestamp so we don't need to
-    // acquire locks before calling Start(). Starting the the op here gives a
+    // acquire locks before calling Start(). Starting the op here gives a
     // strong guarantee to consensus that the op is on mvcc when it moves
     // "safe" time so that we don't risk marking a timestamp "safe" before all
     // ops before it are in-flight are on mvcc.
@@ -440,6 +440,9 @@ void OpDriver::ReplicationFinished(const Status& status) {
     replication_duration = replication_finished_time - replication_start_time_;
   }
 
+  if (auto* metrics = op_->state()->tablet_replica()->tablet()->metrics()) {
+    metrics->replication_duration->Increment(replication_duration.ToMicroseconds());
+  }
   TRACE_COUNTER_INCREMENT("replication_time_us", replication_duration.ToMicroseconds());
   TRACE("REPLICATION: finished");
 
diff --git a/src/kudu/tablet/tablet_metrics.cc b/src/kudu/tablet/tablet_metrics.cc
index 804aa48e7..783618af0 100644
--- a/src/kudu/tablet/tablet_metrics.cc
+++ b/src/kudu/tablet/tablet_metrics.cc
@@ -252,6 +252,13 @@ METRIC_DEFINE_histogram(tablet, alter_schema_duration,
                         kudu::MetricLevel::kDebug,
                         60000000LU, 2);
 
+METRIC_DEFINE_histogram(tablet, replication_duration,
+                        "Replica Replication Duration",
+                        kudu::MetricUnit::kMicroseconds,
+                        "Duration of replication between replicas on the leader.",
+                        kudu::MetricLevel::kDebug,
+                        60000000LU, 2);
+
 METRIC_DEFINE_histogram(tablet, write_op_duration_client_propagated_consistency,
   "Write Op Duration with Propagated Consistency",
   kudu::MetricUnit::kMicroseconds,
@@ -473,6 +480,7 @@ TabletMetrics::TabletMetrics(const scoped_refptr<MetricEntity>& entity)
     MINIT(write_op_duration_client_propagated_consistency),
     MINIT(write_op_duration_commit_wait_consistency),
     MINIT(alter_schema_duration),
+    MINIT(replication_duration),
     GINIT(flush_dms_running),
     GINIT(flush_mrs_running),
     GINIT(compact_rs_running),
diff --git a/src/kudu/tablet/tablet_metrics.h b/src/kudu/tablet/tablet_metrics.h
index 5bd9620ce..5c75ecb3a 100644
--- a/src/kudu/tablet/tablet_metrics.h
+++ b/src/kudu/tablet/tablet_metrics.h
@@ -90,6 +90,7 @@ struct TabletMetrics {
   scoped_refptr<Histogram> write_op_duration_client_propagated_consistency;
   scoped_refptr<Histogram> write_op_duration_commit_wait_consistency;
   scoped_refptr<Histogram> alter_schema_duration;
+  scoped_refptr<Histogram> replication_duration;
 
   scoped_refptr<AtomicGauge<uint32_t> > flush_dms_running;
   scoped_refptr<AtomicGauge<uint32_t> > flush_mrs_running;
diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc
index c9885a645..453309330 100644
--- a/src/kudu/tablet/tablet_replica-test.cc
+++ b/src/kudu/tablet/tablet_replica-test.cc
@@ -771,5 +771,23 @@ TEST_F(TabletReplicaTest, RowLocksLongWaitAndLogging) {
   t1.join();
 }
 
+// Test the replication duration metric works.
+TEST_F(TabletReplicaTest, TestReplicationDurationMetric) {
+  ConsensusBootstrapInfo info;
+  ASSERT_OK(StartReplicaAndWaitUntilLeader(info));
+
+  // The metric should be zero at the beginning.
+  ASSERT_EQ(0, tablet_replica_->tablet()->metrics()->replication_duration->TotalCount());
+
+  auto req = std::make_unique<WriteRequestPB>();
+  ASSERT_OK(GenerateSequentialInsertRequest(GetTestSchema(), req.get()));
+  ASSERT_OK(ExecuteWrite(tablet_replica_.get(), *req));
+
+  // The metric should be non-zero after the write completes.
+  ASSERT_EVENTUALLY([&]{
+    ASSERT_EQ(1, tablet_replica_->tablet()->metrics()->replication_duration->TotalCount());
+  });
+}
+
 } // namespace tablet
 } // namespace kudu

Reply via email to