This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new d370e0e45 [metrics] Add metrics for tablet copy op time
d370e0e45 is described below

commit d370e0e4511508790c065340a52242ee09ecfea3
Author: kedeng <[email protected]>
AuthorDate: Thu Apr 25 11:41:19 2024 +0800

    [metrics] Add metrics for tablet copy op time
    
    Add server-level statistics to track the time consumption of
    copy tablet operations. This is effective both for the source
    tablet and destination tablet during the copy operation.
    
    The addition of monitoring items will aid in historical issue
    tracking and analysis, as well as facilitate the configuration
    of monitoring alarms.
    
    Change-Id: I088f6a9a8a07ad39ca95ae8b4995ce00d1a0d00c
    Reviewed-on: http://gerrit.cloudera.org:8080/21356
    Reviewed-by: Alexey Serbin <[email protected]>
    Tested-by: Alexey Serbin <[email protected]>
---
 src/kudu/integration-tests/tablet_copy-itest.cc | 41 +++++++++++++++++++++++++
 src/kudu/tserver/tablet_copy_client.cc          | 18 ++++++++++-
 src/kudu/tserver/tablet_copy_client.h           |  1 +
 src/kudu/tserver/tablet_copy_service.cc         |  1 +
 src/kudu/tserver/tablet_copy_source_session.cc  | 19 +++++++++++-
 src/kudu/tserver/tablet_copy_source_session.h   |  7 +++++
 6 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc 
b/src/kudu/integration-tests/tablet_copy-itest.cc
index ba3e4be45..c1a0856ad 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -149,6 +149,8 @@ METRIC_DECLARE_counter(glog_error_messages);
 METRIC_DECLARE_counter(rows_inserted);
 METRIC_DECLARE_counter(tablet_copy_bytes_fetched);
 METRIC_DECLARE_counter(tablet_copy_bytes_sent);
+METRIC_DECLARE_histogram(tablet_copy_duration);
+METRIC_DECLARE_histogram(tablet_copy_source_duration);
 METRIC_DECLARE_gauge_int32(tablet_copy_open_client_sessions);
 METRIC_DECLARE_gauge_int32(tablet_copy_open_source_sessions);
 METRIC_DECLARE_gauge_uint64(log_block_manager_blocks_under_management);
@@ -1649,6 +1651,30 @@ int64_t TabletCopyBytesFetched(ExternalTabletServer* 
ets) {
   return ret;
 }
 
+int64_t TabletCopySourceDurationTotalCount(ExternalTabletServer* ets) {
+  int64_t ret;
+  CHECK_OK(GetInt64Metric(
+      ets->bound_http_hostport(),
+      &METRIC_ENTITY_server,
+      "kudu.tabletserver",
+      &METRIC_tablet_copy_source_duration,
+      "total_count",
+      &ret));
+  return ret;
+}
+
+int64_t TabletCopyDurationTotalCount(ExternalTabletServer* ets) {
+  int64_t ret;
+  CHECK_OK(GetInt64Metric(
+      ets->bound_http_hostport(),
+      &METRIC_ENTITY_server,
+      "kudu.tabletserver",
+      &METRIC_tablet_copy_duration,
+      "total_count",
+      &ret));
+  return ret;
+}
+
 int64_t TabletCopyOpenSourceSessions(ExternalTabletServer* ets) {
   int64_t ret;
   CHECK_OK(GetInt64Metric(
@@ -1713,6 +1739,14 @@ TEST_F(TabletCopyITest, TestTabletCopyMetrics) {
   follower_index = (leader_index + 1) % cluster_->num_tablet_servers();
   follower_ts = ts_map_[cluster_->tablet_server(follower_index)->uuid()];
 
+  // Before we start the tablet copy, the metrics count should be zero.
+  int64_t copy_source_duration_cnt_before =
+      
TabletCopySourceDurationTotalCount(cluster_->tablet_server(leader_index));
+  int64_t copy_duration_cnt_before =
+      TabletCopyDurationTotalCount(cluster_->tablet_server(follower_index));
+  ASSERT_EQ(0, copy_source_duration_cnt_before);
+  ASSERT_EQ(0, copy_duration_cnt_before);
+
   LOG(INFO) << "Tombstoning follower tablet " << tablet_id
             << " on TS " << follower_ts->uuid();
   ASSERT_OK(DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, 
kTimeout));
@@ -1744,6 +1778,13 @@ TEST_F(TabletCopyITest, TestTabletCopyMetrics) {
   ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id,
                                   workload.batches_completed()));
 
+  // After copying, the metrics count should be greater than zero.
+  int64_t copy_source_duration_cnt =
+      
TabletCopySourceDurationTotalCount(cluster_->tablet_server(leader_index));
+  int64_t copy_duration_cnt = 
TabletCopyDurationTotalCount(cluster_->tablet_server(follower_index));
+  ASSERT_GT(copy_source_duration_cnt, 0);
+  ASSERT_GT(copy_duration_cnt, 0);
+
   ASSERT_EQ(0, 
TabletCopyOpenSourceSessions(cluster_->tablet_server(leader_index)));
   ASSERT_EQ(0, 
TabletCopyOpenClientSessions(cluster_->tablet_server(follower_index)));
 
diff --git a/src/kudu/tserver/tablet_copy_client.cc 
b/src/kudu/tserver/tablet_copy_client.cc
index fc49f42a1..d392c72c2 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -172,6 +172,13 @@ METRIC_DEFINE_gauge_int32(server, 
tablet_copy_open_client_sessions,
                           "Number of currently open tablet copy client 
sessions on this server",
                           kudu::MetricLevel::kInfo);
 
+METRIC_DEFINE_histogram(server, tablet_copy_duration,
+                        "Tablet Copy Duration",
+                        kudu::MetricUnit::kMilliseconds,
+                        "Duration of tablet copying as destination.",
+                        kudu::MetricLevel::kDebug,
+                        3600000LU, 1);
+
 // RETURN_NOT_OK_PREPEND() with a remote-error unwinding step.
 #define RETURN_NOT_OK_UNWIND_PREPEND(status, controller, msg) \
   RETURN_NOT_OK_PREPEND(UnwindRemoteError(status, controller), msg)
@@ -206,7 +213,8 @@ namespace tserver {
 
 TabletCopyClientMetrics::TabletCopyClientMetrics(const 
scoped_refptr<MetricEntity>& metric_entity)
     : 
bytes_fetched(METRIC_tablet_copy_bytes_fetched.Instantiate(metric_entity)),
-      
open_client_sessions(METRIC_tablet_copy_open_client_sessions.Instantiate(metric_entity,
 0)) {
+      
open_client_sessions(METRIC_tablet_copy_open_client_sessions.Instantiate(metric_entity,
 0)),
+      copy_duration(METRIC_tablet_copy_duration.Instantiate(metric_entity)) {
 }
 
 TabletCopyClient::TabletCopyClient(
@@ -559,6 +567,10 @@ Status TabletCopyClient::Finish() {
   // Now that we've finished everything, complete.
   revert_activate_superblock.cancel();
   state_ = kFinished;
+  if (dst_tablet_copy_metrics_) {
+    int64_t dur = GetCurrentTimeMicros() - start_time_micros_;
+    dst_tablet_copy_metrics_->copy_duration->Increment(dur / 1000);
+  }
   return Status::OK();
 }
 
@@ -576,6 +588,10 @@ Status TabletCopyClient::Abort() {
   SCOPED_CLEANUP({
     DCHECK_EQ(tablet::TABLET_DATA_TOMBSTONED, meta_->tablet_data_state());
     state_ = kFinished;
+    if (dst_tablet_copy_metrics_) {
+      int64_t dur = GetCurrentTimeMicros() - start_time_micros_;
+      dst_tablet_copy_metrics_->copy_duration->Increment(dur / 1000);
+    }
   });
 
   // Load the in-progress superblock in-memory so that when we delete the
diff --git a/src/kudu/tserver/tablet_copy_client.h 
b/src/kudu/tserver/tablet_copy_client.h
index 38274aa55..0f1c8974d 100644
--- a/src/kudu/tserver/tablet_copy_client.h
+++ b/src/kudu/tserver/tablet_copy_client.h
@@ -77,6 +77,7 @@ struct TabletCopyClientMetrics {
 
   scoped_refptr<Counter> bytes_fetched;
   scoped_refptr<AtomicGauge<int32_t>> open_client_sessions;
+  scoped_refptr<Histogram> copy_duration;
 };
 
 // Client class for using tablet copy to copy a tablet from another host.
diff --git a/src/kudu/tserver/tablet_copy_service.cc 
b/src/kudu/tserver/tablet_copy_service.cc
index 036366e18..b318a227f 100644
--- a/src/kudu/tserver/tablet_copy_service.cc
+++ b/src/kudu/tserver/tablet_copy_service.cc
@@ -428,6 +428,7 @@ Status 
TabletCopyServiceImpl::DoEndTabletCopySessionUnlocked(
   sessions_lock_.AssertAcquired();
   scoped_refptr<RemoteTabletCopySourceSession> session;
   RETURN_NOT_OK(FindSessionUnlocked(session_id, app_error, &session));
+  session->UpdateTabletMetrics();
   // Remove the session from the map.
   // It will get destroyed once there are no outstanding refs.
   LOG_WITH_PREFIX(INFO) << "ending tablet copy session " << session_id << " on 
tablet "
diff --git a/src/kudu/tserver/tablet_copy_source_session.cc 
b/src/kudu/tserver/tablet_copy_source_session.cc
index 0c841bd32..4549e3a1f 100644
--- a/src/kudu/tserver/tablet_copy_source_session.cc
+++ b/src/kudu/tserver/tablet_copy_source_session.cc
@@ -73,6 +73,13 @@ METRIC_DEFINE_gauge_int32(server, 
tablet_copy_open_source_sessions,
                           "Number of currently open tablet copy source 
sessions on this server",
                           kudu::MetricLevel::kInfo);
 
+METRIC_DEFINE_histogram(server, tablet_copy_source_duration,
+                        "Source Tablet Copy Duration",
+                        kudu::MetricUnit::kMilliseconds,
+                        "Duration of tablet copying as source.",
+                        kudu::MetricLevel::kDebug,
+                        3600000LU, 1);
+
 DEFINE_int32(tablet_copy_session_inject_latency_on_init_ms, 0,
              "How much latency (in ms) to inject when a tablet copy session is 
initialized. "
              "(For testing only!)");
@@ -96,7 +103,8 @@ using tablet::TabletReplica;
 
 TabletCopySourceMetrics::TabletCopySourceMetrics(const 
scoped_refptr<MetricEntity>& metric_entity)
     : bytes_sent(METRIC_tablet_copy_bytes_sent.Instantiate(metric_entity)),
-      
open_source_sessions(METRIC_tablet_copy_open_source_sessions.Instantiate(metric_entity,
 0)) {
+      
open_source_sessions(METRIC_tablet_copy_open_source_sessions.Instantiate(metric_entity,
 0)),
+      
copy_duration(METRIC_tablet_copy_source_duration.Instantiate(metric_entity)) {
 }
 
 TabletCopySourceSession::TabletCopySourceSession(
@@ -124,6 +132,8 @@ Status TabletCopySourceSession::Init() {
 }
 
 Status RemoteTabletCopySourceSession::InitOnce() {
+  start_time_ = MonoTime::Now();
+
   // Inject latency during Init() for testing purposes.
   if (PREDICT_FALSE(FLAGS_tablet_copy_session_inject_latency_on_init_ms > 0)) {
     TRACE("Injecting $0ms of latency due to 
--tablet_copy_session_inject_latency_on_init_ms",
@@ -504,6 +514,13 @@ Status 
RemoteTabletCopySourceSession::UnregisterAnchorIfNeededUnlocked() {
   return 
tablet_replica_->log_anchor_registry()->UnregisterIfAnchored(&log_anchor_);
 }
 
+void RemoteTabletCopySourceSession::UpdateTabletMetrics() {
+  if (PREDICT_TRUE(start_time_.Initialized())) {
+    tablet_copy_metrics_->copy_duration->Increment(
+        (MonoTime::Now() - start_time_).ToMilliseconds());
+  }
+}
+
 LocalTabletCopySourceSession::LocalTabletCopySourceSession(
     string tablet_id,
     FsManager* fs_manager,
diff --git a/src/kudu/tserver/tablet_copy_source_session.h 
b/src/kudu/tserver/tablet_copy_source_session.h
index e7c0cb66b..09d0a1af5 100644
--- a/src/kudu/tserver/tablet_copy_source_session.h
+++ b/src/kudu/tserver/tablet_copy_source_session.h
@@ -37,6 +37,7 @@
 #include "kudu/tablet/metadata.pb.h"
 #include "kudu/tserver/tablet_copy.pb.h"
 #include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
 #include "kudu/util/once.h"
 #include "kudu/util/slice.h"
 #include "kudu/util/status.h"
@@ -58,6 +59,7 @@ struct TabletCopySourceMetrics {
 
   scoped_refptr<Counter> bytes_sent;
   scoped_refptr<AtomicGauge<int32_t>> open_source_sessions;
+  scoped_refptr<Histogram> copy_duration;
 };
 
 // Caches file size and holds a shared_ptr reference to a RWFile.
@@ -245,12 +247,17 @@ class RemoteTabletCopySourceSession : public 
TabletCopySourceSession {
   // Unregister log anchor, if it's registered.
   Status UnregisterAnchorIfNeededUnlocked();
 
+  // Update tablet metrics.
+  void UpdateTabletMetrics();
+
  private:
   Status InitOnce() override;
 
   scoped_refptr<tablet::TabletReplica> tablet_replica_;
   const std::string session_id_;
   const std::string requestor_uuid_;
+  // this session's start time
+  MonoTime start_time_;
 };
 
 // Copy replica from local file system.

Reply via email to