This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new d370e0e45 [metrics] Add metrics for tablet copy op time
d370e0e45 is described below
commit d370e0e4511508790c065340a52242ee09ecfea3
Author: kedeng <[email protected]>
AuthorDate: Thu Apr 25 11:41:19 2024 +0800
[metrics] Add metrics for tablet copy op time
Add server-level statistics to track the time consumption of
copy tablet operations. This is effective both for the source
tablet and destination tablet during the copy operation.
The addition of monitoring items will aid in historical issue
tracking and analysis, as well as facilitate the configuration
of monitoring alarms.
Change-Id: I088f6a9a8a07ad39ca95ae8b4995ce00d1a0d00c
Reviewed-on: http://gerrit.cloudera.org:8080/21356
Reviewed-by: Alexey Serbin <[email protected]>
Tested-by: Alexey Serbin <[email protected]>
---
src/kudu/integration-tests/tablet_copy-itest.cc | 41 +++++++++++++++++++++++++
src/kudu/tserver/tablet_copy_client.cc | 18 ++++++++++-
src/kudu/tserver/tablet_copy_client.h | 1 +
src/kudu/tserver/tablet_copy_service.cc | 1 +
src/kudu/tserver/tablet_copy_source_session.cc | 19 +++++++++++-
src/kudu/tserver/tablet_copy_source_session.h | 7 +++++
6 files changed, 85 insertions(+), 2 deletions(-)
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc
b/src/kudu/integration-tests/tablet_copy-itest.cc
index ba3e4be45..c1a0856ad 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -149,6 +149,8 @@ METRIC_DECLARE_counter(glog_error_messages);
METRIC_DECLARE_counter(rows_inserted);
METRIC_DECLARE_counter(tablet_copy_bytes_fetched);
METRIC_DECLARE_counter(tablet_copy_bytes_sent);
+METRIC_DECLARE_histogram(tablet_copy_duration);
+METRIC_DECLARE_histogram(tablet_copy_source_duration);
METRIC_DECLARE_gauge_int32(tablet_copy_open_client_sessions);
METRIC_DECLARE_gauge_int32(tablet_copy_open_source_sessions);
METRIC_DECLARE_gauge_uint64(log_block_manager_blocks_under_management);
@@ -1649,6 +1651,30 @@ int64_t TabletCopyBytesFetched(ExternalTabletServer*
ets) {
return ret;
}
+int64_t TabletCopySourceDurationTotalCount(ExternalTabletServer* ets) {
+ int64_t ret;
+ CHECK_OK(GetInt64Metric(
+ ets->bound_http_hostport(),
+ &METRIC_ENTITY_server,
+ "kudu.tabletserver",
+ &METRIC_tablet_copy_source_duration,
+ "total_count",
+ &ret));
+ return ret;
+}
+
+int64_t TabletCopyDurationTotalCount(ExternalTabletServer* ets) {
+ int64_t ret;
+ CHECK_OK(GetInt64Metric(
+ ets->bound_http_hostport(),
+ &METRIC_ENTITY_server,
+ "kudu.tabletserver",
+ &METRIC_tablet_copy_duration,
+ "total_count",
+ &ret));
+ return ret;
+}
+
int64_t TabletCopyOpenSourceSessions(ExternalTabletServer* ets) {
int64_t ret;
CHECK_OK(GetInt64Metric(
@@ -1713,6 +1739,14 @@ TEST_F(TabletCopyITest, TestTabletCopyMetrics) {
follower_index = (leader_index + 1) % cluster_->num_tablet_servers();
follower_ts = ts_map_[cluster_->tablet_server(follower_index)->uuid()];
+ // Before we start the tablet copy, the metrics count should be zero.
+ int64_t copy_source_duration_cnt_before =
+
TabletCopySourceDurationTotalCount(cluster_->tablet_server(leader_index));
+ int64_t copy_duration_cnt_before =
+ TabletCopyDurationTotalCount(cluster_->tablet_server(follower_index));
+ ASSERT_EQ(0, copy_source_duration_cnt_before);
+ ASSERT_EQ(0, copy_duration_cnt_before);
+
LOG(INFO) << "Tombstoning follower tablet " << tablet_id
<< " on TS " << follower_ts->uuid();
ASSERT_OK(DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED,
kTimeout));
@@ -1744,6 +1778,13 @@ TEST_F(TabletCopyITest, TestTabletCopyMetrics) {
ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id,
workload.batches_completed()));
+ // After copying, the metrics count should be greater than zero.
+ int64_t copy_source_duration_cnt =
+
TabletCopySourceDurationTotalCount(cluster_->tablet_server(leader_index));
+ int64_t copy_duration_cnt =
TabletCopyDurationTotalCount(cluster_->tablet_server(follower_index));
+ ASSERT_GT(copy_source_duration_cnt, 0);
+ ASSERT_GT(copy_duration_cnt, 0);
+
ASSERT_EQ(0,
TabletCopyOpenSourceSessions(cluster_->tablet_server(leader_index)));
ASSERT_EQ(0,
TabletCopyOpenClientSessions(cluster_->tablet_server(follower_index)));
diff --git a/src/kudu/tserver/tablet_copy_client.cc
b/src/kudu/tserver/tablet_copy_client.cc
index fc49f42a1..d392c72c2 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -172,6 +172,13 @@ METRIC_DEFINE_gauge_int32(server,
tablet_copy_open_client_sessions,
"Number of currently open tablet copy client
sessions on this server",
kudu::MetricLevel::kInfo);
+METRIC_DEFINE_histogram(server, tablet_copy_duration,
+ "Tablet Copy Duration",
+ kudu::MetricUnit::kMilliseconds,
+ "Duration of tablet copying as destination.",
+ kudu::MetricLevel::kDebug,
+ 3600000LU, 1);
+
// RETURN_NOT_OK_PREPEND() with a remote-error unwinding step.
#define RETURN_NOT_OK_UNWIND_PREPEND(status, controller, msg) \
RETURN_NOT_OK_PREPEND(UnwindRemoteError(status, controller), msg)
@@ -206,7 +213,8 @@ namespace tserver {
TabletCopyClientMetrics::TabletCopyClientMetrics(const
scoped_refptr<MetricEntity>& metric_entity)
:
bytes_fetched(METRIC_tablet_copy_bytes_fetched.Instantiate(metric_entity)),
-
open_client_sessions(METRIC_tablet_copy_open_client_sessions.Instantiate(metric_entity,
0)) {
+
open_client_sessions(METRIC_tablet_copy_open_client_sessions.Instantiate(metric_entity,
0)),
+ copy_duration(METRIC_tablet_copy_duration.Instantiate(metric_entity)) {
}
TabletCopyClient::TabletCopyClient(
@@ -559,6 +567,10 @@ Status TabletCopyClient::Finish() {
// Now that we've finished everything, complete.
revert_activate_superblock.cancel();
state_ = kFinished;
+ if (dst_tablet_copy_metrics_) {
+ int64_t dur = GetCurrentTimeMicros() - start_time_micros_;
+ dst_tablet_copy_metrics_->copy_duration->Increment(dur / 1000);
+ }
return Status::OK();
}
@@ -576,6 +588,10 @@ Status TabletCopyClient::Abort() {
SCOPED_CLEANUP({
DCHECK_EQ(tablet::TABLET_DATA_TOMBSTONED, meta_->tablet_data_state());
state_ = kFinished;
+ if (dst_tablet_copy_metrics_) {
+ int64_t dur = GetCurrentTimeMicros() - start_time_micros_;
+ dst_tablet_copy_metrics_->copy_duration->Increment(dur / 1000);
+ }
});
// Load the in-progress superblock in-memory so that when we delete the
diff --git a/src/kudu/tserver/tablet_copy_client.h
b/src/kudu/tserver/tablet_copy_client.h
index 38274aa55..0f1c8974d 100644
--- a/src/kudu/tserver/tablet_copy_client.h
+++ b/src/kudu/tserver/tablet_copy_client.h
@@ -77,6 +77,7 @@ struct TabletCopyClientMetrics {
scoped_refptr<Counter> bytes_fetched;
scoped_refptr<AtomicGauge<int32_t>> open_client_sessions;
+ scoped_refptr<Histogram> copy_duration;
};
// Client class for using tablet copy to copy a tablet from another host.
diff --git a/src/kudu/tserver/tablet_copy_service.cc
b/src/kudu/tserver/tablet_copy_service.cc
index 036366e18..b318a227f 100644
--- a/src/kudu/tserver/tablet_copy_service.cc
+++ b/src/kudu/tserver/tablet_copy_service.cc
@@ -428,6 +428,7 @@ Status
TabletCopyServiceImpl::DoEndTabletCopySessionUnlocked(
sessions_lock_.AssertAcquired();
scoped_refptr<RemoteTabletCopySourceSession> session;
RETURN_NOT_OK(FindSessionUnlocked(session_id, app_error, &session));
+ session->UpdateTabletMetrics();
// Remove the session from the map.
// It will get destroyed once there are no outstanding refs.
LOG_WITH_PREFIX(INFO) << "ending tablet copy session " << session_id << " on
tablet "
diff --git a/src/kudu/tserver/tablet_copy_source_session.cc
b/src/kudu/tserver/tablet_copy_source_session.cc
index 0c841bd32..4549e3a1f 100644
--- a/src/kudu/tserver/tablet_copy_source_session.cc
+++ b/src/kudu/tserver/tablet_copy_source_session.cc
@@ -73,6 +73,13 @@ METRIC_DEFINE_gauge_int32(server,
tablet_copy_open_source_sessions,
"Number of currently open tablet copy source
sessions on this server",
kudu::MetricLevel::kInfo);
+METRIC_DEFINE_histogram(server, tablet_copy_source_duration,
+ "Source Tablet Copy Duration",
+ kudu::MetricUnit::kMilliseconds,
+ "Duration of tablet copying as source.",
+ kudu::MetricLevel::kDebug,
+ 3600000LU, 1);
+
DEFINE_int32(tablet_copy_session_inject_latency_on_init_ms, 0,
"How much latency (in ms) to inject when a tablet copy session is
initialized. "
"(For testing only!)");
@@ -96,7 +103,8 @@ using tablet::TabletReplica;
TabletCopySourceMetrics::TabletCopySourceMetrics(const
scoped_refptr<MetricEntity>& metric_entity)
: bytes_sent(METRIC_tablet_copy_bytes_sent.Instantiate(metric_entity)),
-
open_source_sessions(METRIC_tablet_copy_open_source_sessions.Instantiate(metric_entity,
0)) {
+
open_source_sessions(METRIC_tablet_copy_open_source_sessions.Instantiate(metric_entity,
0)),
+
copy_duration(METRIC_tablet_copy_source_duration.Instantiate(metric_entity)) {
}
TabletCopySourceSession::TabletCopySourceSession(
@@ -124,6 +132,8 @@ Status TabletCopySourceSession::Init() {
}
Status RemoteTabletCopySourceSession::InitOnce() {
+ start_time_ = MonoTime::Now();
+
// Inject latency during Init() for testing purposes.
if (PREDICT_FALSE(FLAGS_tablet_copy_session_inject_latency_on_init_ms > 0)) {
TRACE("Injecting $0ms of latency due to
--tablet_copy_session_inject_latency_on_init_ms",
@@ -504,6 +514,13 @@ Status
RemoteTabletCopySourceSession::UnregisterAnchorIfNeededUnlocked() {
return
tablet_replica_->log_anchor_registry()->UnregisterIfAnchored(&log_anchor_);
}
+void RemoteTabletCopySourceSession::UpdateTabletMetrics() {
+ if (PREDICT_TRUE(start_time_.Initialized())) {
+ tablet_copy_metrics_->copy_duration->Increment(
+ (MonoTime::Now() - start_time_).ToMilliseconds());
+ }
+}
+
LocalTabletCopySourceSession::LocalTabletCopySourceSession(
string tablet_id,
FsManager* fs_manager,
diff --git a/src/kudu/tserver/tablet_copy_source_session.h
b/src/kudu/tserver/tablet_copy_source_session.h
index e7c0cb66b..09d0a1af5 100644
--- a/src/kudu/tserver/tablet_copy_source_session.h
+++ b/src/kudu/tserver/tablet_copy_source_session.h
@@ -37,6 +37,7 @@
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tserver/tablet_copy.pb.h"
#include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
#include "kudu/util/once.h"
#include "kudu/util/slice.h"
#include "kudu/util/status.h"
@@ -58,6 +59,7 @@ struct TabletCopySourceMetrics {
scoped_refptr<Counter> bytes_sent;
scoped_refptr<AtomicGauge<int32_t>> open_source_sessions;
+ scoped_refptr<Histogram> copy_duration;
};
// Caches file size and holds a shared_ptr reference to a RWFile.
@@ -245,12 +247,17 @@ class RemoteTabletCopySourceSession : public
TabletCopySourceSession {
// Unregister log anchor, if it's registered.
Status UnregisterAnchorIfNeededUnlocked();
+ // Update tablet metrics.
+ void UpdateTabletMetrics();
+
private:
Status InitOnce() override;
scoped_refptr<tablet::TabletReplica> tablet_replica_;
const std::string session_id_;
const std::string requestor_uuid_;
+ // this session's start time
+ MonoTime start_time_;
};
// Copy replica from local file system.