This is an automated email from the ASF dual-hosted git repository.

wangdan pushed a commit to branch migrate-metrics-dev
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git

commit 048bda9c6dbea889243f529998c59e15a765dc20
Author: Dan Wang <[email protected]>
AuthorDate: Wed Apr 26 11:03:06 2023 +0800

    feat(new_metrics): migrate metrics for replica_stub (part 2) (#1459)
    
    https://github.com/apache/incubator-pegasus/issues/1454
    
    This is the 2nd part of migrating the metrics of `replica_stub` to the new
    framework, all of which are learn-related.
    
    During this migration, 3 metrics are still kept at server level: the number
    of learning replicas, the max duration, and the max size of files that are
    copied from learnee among all learning replicas.
    
    Another 11 metrics are changed from server-level to replica-level, since
    they should be observed for each replica. All of them are observed from the
    view of learners, namely the potential secondary replicas. The learnee is
    the primary replica. These metrics include: the number of learns launched
    by learner, the number of learn rounds launched by learner (during a learn
    there might be multiple rounds), the number of files that are copied from
    learnee, the size of files that are copied from learnee, the size of data
    that are copied from learnee's buffer, the number of learn responses of
    `LT_CACHE`, `LT_APP` and `LT_LOG` type decided by learner with each learn
    response related to an `RPC_LEARN` request, the number of times learner
    resets its local state (since its local state is newer than learnee's) with
    each reset related to a learn response of an `RPC_LEARN` request, the
    number of failed and successful learns launched by learner.
---
 src/replica/replica.cpp       | 75 ++++++++++++++++++++++++++++++++-
 src/replica/replica.h         | 12 ++++++
 src/replica/replica_learn.cpp | 39 ++++++++++-------
 src/replica/replica_stub.cpp  | 98 ++++++++++---------------------------------
 src/replica/replica_stub.h    | 17 ++------
 src/utils/metrics.h           |  4 ++
 6 files changed, 141 insertions(+), 104 deletions(-)

diff --git a/src/replica/replica.cpp b/src/replica/replica.cpp
index fd8b73d3c..ad458c543 100644
--- a/src/replica/replica.cpp
+++ b/src/replica/replica.cpp
@@ -127,6 +127,68 @@ METRIC_DEFINE_counter(replica,
                       dsn::metric_unit::kRequests,
                       "The number of rejected non-idempotent write requests by 
duplication");
 
+METRIC_DEFINE_counter(
+    replica,
+    learn_count,
+    dsn::metric_unit::kLearns,
+    "The number of learns launched by learner (i.e. potential secondary 
replica)");
+
+METRIC_DEFINE_counter(replica,
+                      learn_rounds,
+                      dsn::metric_unit::kRounds,
+                      "The number of learn rounds launched by learner (during 
a learn there might"
+                      "be multiple rounds)");
+
+METRIC_DEFINE_counter(replica,
+                      learn_copy_files,
+                      dsn::metric_unit::kFiles,
+                      "The number of files that are copied from learnee (i.e. 
primary replica)");
+
+METRIC_DEFINE_counter(replica,
+                      learn_copy_file_bytes,
+                      dsn::metric_unit::kBytes,
+                      "The size of file that are copied from learnee");
+
+METRIC_DEFINE_counter(replica,
+                      learn_copy_buffer_bytes,
+                      dsn::metric_unit::kBytes,
+                      "The size of data that are copied from learnee's 
buffer");
+
+METRIC_DEFINE_counter(replica,
+                      learn_lt_cache_responses,
+                      dsn::metric_unit::kResponses,
+                      "The number of learn responses of LT_CACHE type decided 
by learner, with "
+                      "each learn response related to an RPC_LEARN request");
+
+METRIC_DEFINE_counter(replica,
+                      learn_lt_app_responses,
+                      dsn::metric_unit::kResponses,
+                      "The number of learn responses of LT_APP type decided by 
learner, with each "
+                      "learn response related to an RPC_LEARN request");
+
+METRIC_DEFINE_counter(replica,
+                      learn_lt_log_responses,
+                      dsn::metric_unit::kResponses,
+                      "The number of learn responses of LT_LOG type decided by 
learner, with each "
+                      "learn response related to an RPC_LEARN request");
+
+METRIC_DEFINE_counter(replica,
+                      learn_resets,
+                      dsn::metric_unit::kResets,
+                      "The number of times learner resets its local state 
(since its local state "
+                      "is newer than learnee's), with each reset related to an 
learn response of "
+                      "an RPC_LEARN request");
+
+METRIC_DEFINE_counter(replica,
+                      learn_failed_count,
+                      dsn::metric_unit::kLearns,
+                      "The number of failed learns launched by learner");
+
+METRIC_DEFINE_counter(replica,
+                      learn_successful_count,
+                      dsn::metric_unit::kLearns,
+                      "The number of successful learns launched by learner");
+
 namespace dsn {
 namespace replication {
 
@@ -191,7 +253,18 @@ replica::replica(replica_stub *stub,
       METRIC_VAR_INIT_replica(splitting_rejected_write_requests),
       METRIC_VAR_INIT_replica(splitting_rejected_read_requests),
       METRIC_VAR_INIT_replica(bulk_load_ingestion_rejected_write_requests),
-      METRIC_VAR_INIT_replica(dup_rejected_non_idempotent_write_requests)
+      METRIC_VAR_INIT_replica(dup_rejected_non_idempotent_write_requests),
+      METRIC_VAR_INIT_replica(learn_count),
+      METRIC_VAR_INIT_replica(learn_rounds),
+      METRIC_VAR_INIT_replica(learn_copy_files),
+      METRIC_VAR_INIT_replica(learn_copy_file_bytes),
+      METRIC_VAR_INIT_replica(learn_copy_buffer_bytes),
+      METRIC_VAR_INIT_replica(learn_lt_cache_responses),
+      METRIC_VAR_INIT_replica(learn_lt_app_responses),
+      METRIC_VAR_INIT_replica(learn_lt_log_responses),
+      METRIC_VAR_INIT_replica(learn_resets),
+      METRIC_VAR_INIT_replica(learn_failed_count),
+      METRIC_VAR_INIT_replica(learn_successful_count)
 {
     CHECK(!_app_info.app_type.empty(), "");
     CHECK_NOTNULL(stub, "");
diff --git a/src/replica/replica.h b/src/replica/replica.h
index 91e7896dd..8df71a964 100644
--- a/src/replica/replica.h
+++ b/src/replica/replica.h
@@ -668,6 +668,18 @@ private:
     METRIC_VAR_DECLARE_counter(dup_rejected_non_idempotent_write_requests);
     std::vector<perf_counter *> _counters_table_level_latency;
 
+    METRIC_VAR_DECLARE_counter(learn_count);
+    METRIC_VAR_DECLARE_counter(learn_rounds);
+    METRIC_VAR_DECLARE_counter(learn_copy_files);
+    METRIC_VAR_DECLARE_counter(learn_copy_file_bytes);
+    METRIC_VAR_DECLARE_counter(learn_copy_buffer_bytes);
+    METRIC_VAR_DECLARE_counter(learn_lt_cache_responses);
+    METRIC_VAR_DECLARE_counter(learn_lt_app_responses);
+    METRIC_VAR_DECLARE_counter(learn_lt_log_responses);
+    METRIC_VAR_DECLARE_counter(learn_resets);
+    METRIC_VAR_DECLARE_counter(learn_failed_count);
+    METRIC_VAR_DECLARE_counter(learn_successful_count);
+
     dsn::task_tracker _tracker;
     // the thread access checker
     dsn::thread_access_checker _checker;
diff --git a/src/replica/replica_learn.cpp b/src/replica/replica_learn.cpp
index 75fc9c589..b3f6f6a1a 100644
--- a/src/replica/replica_learn.cpp
+++ b/src/replica/replica_learn.cpp
@@ -58,8 +58,6 @@
 #include "mutation.h"
 #include "mutation_log.h"
 #include "nfs/nfs_node.h"
-#include "perf_counter/perf_counter.h"
-#include "perf_counter/perf_counter_wrapper.h"
 #include "replica.h"
 #include "replica/duplication/replica_duplicator_manager.h"
 #include "replica/prepare_list.h"
@@ -80,8 +78,21 @@
 #include "utils/filesystem.h"
 #include "utils/flags.h"
 #include "utils/fmt_logging.h"
+#include "utils/metrics.h"
 #include "utils/thread_access_checker.h"
 
+METRIC_DECLARE_counter(learn_count);
+METRIC_DECLARE_counter(learn_rounds);
+METRIC_DECLARE_counter(learn_copy_files);
+METRIC_DECLARE_counter(learn_copy_file_bytes);
+METRIC_DECLARE_counter(learn_copy_buffer_bytes);
+METRIC_DECLARE_counter(learn_lt_cache_responses);
+METRIC_DECLARE_counter(learn_lt_app_responses);
+METRIC_DECLARE_counter(learn_lt_log_responses);
+METRIC_DECLARE_counter(learn_resets);
+METRIC_DECLARE_counter(learn_failed_count);
+METRIC_DECLARE_counter(learn_successful_count);
+
 namespace dsn {
 namespace replication {
 
@@ -137,7 +148,7 @@ void replica::init_learn(uint64_t signature)
             return;
         }
 
-        _stub->_counter_replicas_learning_recent_start_count->increment();
+        METRIC_VAR_INCREMENT(learn_count);
 
         _potential_secondary_states.learning_version = signature;
         _potential_secondary_states.learning_start_ts_ns = dsn_now_ns();
@@ -177,7 +188,7 @@ void replica::init_learn(uint64_t signature)
 
                     // missed ones need to be loaded via private logs
                     else {
-                        
_stub->_counter_replicas_learning_recent_round_start_count->increment();
+                        METRIC_VAR_INCREMENT(learn_rounds);
                         _potential_secondary_states.learning_round_is_running 
= true;
                         
_potential_secondary_states.catchup_with_private_log_task =
                             tasking::create_task(LPC_CATCHUP_WITH_PRIVATE_LOGS,
@@ -231,7 +242,7 @@ void replica::init_learn(uint64_t signature)
         return;
     }
 
-    _stub->_counter_replicas_learning_recent_round_start_count->increment();
+    METRIC_VAR_INCREMENT(learn_rounds);
     _potential_secondary_states.learning_round_is_running = true;
 
     learn_request request;
@@ -602,7 +613,7 @@ void replica::on_learn_reply(error_code err, learn_request 
&&req, learn_response
         enum_to_string(_potential_secondary_states.learning_status));
 
     _potential_secondary_states.learning_copy_buffer_size += 
resp.state.meta.length();
-    
_stub->_counter_replicas_learning_recent_copy_buffer_size->add(resp.state.meta.length());
+    METRIC_VAR_INCREMENT_BY(learn_copy_buffer_bytes, resp.state.meta.length());
 
     if (resp.err != ERR_OK) {
         if (resp.err == ERR_INACTIVE_STATE || resp.err == 
ERR_INCONSISTENT_STATE) {
@@ -650,7 +661,7 @@ void replica::on_learn_reply(error_code err, learn_request 
&&req, learn_response
                            _app->last_committed_decree(),
                            resp.last_committed_decree);
 
-        
_stub->_counter_replicas_learning_recent_learn_reset_count->increment();
+        METRIC_VAR_INCREMENT(learn_resets);
 
         // close app
         auto err = _app->close(true);
@@ -743,13 +754,13 @@ void replica::on_learn_reply(error_code err, 
learn_request &&req, learn_response
 
     switch (resp.type) {
     case learn_type::LT_CACHE:
-        
_stub->_counter_replicas_learning_recent_learn_cache_count->increment();
+        METRIC_VAR_INCREMENT(learn_lt_cache_responses);
         break;
     case learn_type::LT_APP:
-        _stub->_counter_replicas_learning_recent_learn_app_count->increment();
+        METRIC_VAR_INCREMENT(learn_lt_app_responses);
         break;
     case learn_type::LT_LOG:
-        _stub->_counter_replicas_learning_recent_learn_log_count->increment();
+        METRIC_VAR_INCREMENT(learn_lt_log_responses);
         break;
     default:
         // do nothing
@@ -1044,8 +1055,8 @@ void replica::on_copy_remote_state_completed(error_code 
err,
     if (err == ERR_OK) {
         _potential_secondary_states.learning_copy_file_count += 
resp.state.files.size();
         _potential_secondary_states.learning_copy_file_size += size;
-        
_stub->_counter_replicas_learning_recent_copy_file_count->add(resp.state.files.size());
-        _stub->_counter_replicas_learning_recent_copy_file_size->add(size);
+        METRIC_VAR_INCREMENT_BY(learn_copy_files, resp.state.files.size());
+        METRIC_VAR_INCREMENT_BY(learn_copy_file_bytes, size);
     }
 
     if (err != ERR_OK) {
@@ -1246,7 +1257,7 @@ void replica::handle_learning_error(error_code err, bool 
is_local_error)
         }
     }
 
-    _stub->_counter_replicas_learning_recent_learn_fail_count->increment();
+    METRIC_VAR_INCREMENT(learn_failed_count);
 
     update_local_configuration_with_no_ballot_change(
         is_local_error ? partition_status::PS_ERROR : 
partition_status::PS_INACTIVE);
@@ -1410,7 +1421,7 @@ void 
replica::on_learn_completion_notification_reply(error_code err,
             handle_learning_error(resp.err, false);
         }
     } else {
-        _stub->_counter_replicas_learning_recent_learn_succ_count->increment();
+        METRIC_VAR_INCREMENT(learn_successful_count);
     }
 }
 
diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp
index bf60d6a8c..9c00fc6e3 100644
--- a/src/replica/replica_stub.cpp
+++ b/src/replica/replica_stub.cpp
@@ -112,6 +112,22 @@ METRIC_DEFINE_gauge_int64(server,
                           dsn::metric_unit::kReplicas,
                           "The number of closing replicas");
 
+METRIC_DEFINE_gauge_int64(server,
+                          learning_replicas,
+                          dsn::metric_unit::kReplicas,
+                          "The number of learning replicas");
+
+METRIC_DEFINE_gauge_int64(server,
+                          learning_replicas_max_duration_ms,
+                          dsn::metric_unit::kMilliSeconds,
+                          "The max duration among all learning replicas");
+
+METRIC_DEFINE_gauge_int64(
+    server,
+    learning_replicas_max_copy_file_bytes,
+    dsn::metric_unit::kBytes,
+    "The max size of files that are copied from learnee among all learning 
replicas");
+
 namespace dsn {
 namespace replication {
 DSN_DEFINE_bool(replication,
@@ -220,7 +236,10 @@ replica_stub::replica_stub(replica_state_subscriber 
subscriber /*= nullptr*/,
       _is_running(false),
       METRIC_VAR_INIT_server(total_replicas),
       METRIC_VAR_INIT_server(opening_replicas),
-      METRIC_VAR_INIT_server(closing_replicas)
+      METRIC_VAR_INIT_server(closing_replicas),
+      METRIC_VAR_INIT_server(learning_replicas),
+      METRIC_VAR_INIT_server(learning_replicas_max_duration_ms),
+      METRIC_VAR_INIT_server(learning_replicas_max_copy_file_bytes)
 {
 #ifdef DSN_ENABLE_GPERF
     _is_releasing_memory = false;
@@ -238,77 +257,6 @@ replica_stub::~replica_stub(void) { close(); }
 
 void replica_stub::install_perf_counters()
 {
-    _counter_replicas_learning_count.init_app_counter("eon.replica_stub",
-                                                      
"replicas.learning.count",
-                                                      COUNTER_TYPE_NUMBER,
-                                                      "current learning 
count");
-    _counter_replicas_learning_max_duration_time_ms.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.max.duration.time(ms)",
-        COUNTER_TYPE_NUMBER,
-        "current learning max duration time(ms)");
-    _counter_replicas_learning_max_copy_file_size.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.max.copy.file.size",
-        COUNTER_TYPE_NUMBER,
-        "current learning max copy file size");
-    _counter_replicas_learning_recent_start_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.start.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "current learning start count in the recent period");
-    _counter_replicas_learning_recent_round_start_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.round.start.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning round start count in the recent period");
-    _counter_replicas_learning_recent_copy_file_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.copy.file.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning copy file count in the recent period");
-    _counter_replicas_learning_recent_copy_file_size.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.copy.file.size",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning copy file size in the recent period");
-    _counter_replicas_learning_recent_copy_buffer_size.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.copy.buffer.size",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning copy buffer size in the recent period");
-    _counter_replicas_learning_recent_learn_cache_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.learn.cache.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning LT_CACHE count in the recent period");
-    _counter_replicas_learning_recent_learn_app_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.learn.app.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning LT_APP count in the recent period");
-    _counter_replicas_learning_recent_learn_log_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.learn.log.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning LT_LOG count in the recent period");
-    _counter_replicas_learning_recent_learn_reset_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.learn.reset.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning reset count in the recent period"
-        "for the reason of resp.last_committed_decree < 
_app->last_committed_decree()");
-    _counter_replicas_learning_recent_learn_fail_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.learn.fail.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning fail count in the recent period");
-    _counter_replicas_learning_recent_learn_succ_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.learning.recent.learn.succ.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "learning succeed count in the recent period");
-
     _counter_replicas_recent_prepare_fail_count.init_app_counter(
         "eon.replica_stub",
         "replicas.recent.prepare.fail.count",
@@ -1957,9 +1905,9 @@ void replica_stub::on_gc()
         }
     }
 
-    _counter_replicas_learning_count->set(learning_count);
-    
_counter_replicas_learning_max_duration_time_ms->set(learning_max_duration_time_ms);
-    
_counter_replicas_learning_max_copy_file_size->set(learning_max_copy_file_size);
+    METRIC_VAR_SET(learning_replicas, learning_count);
+    METRIC_VAR_SET(learning_replicas_max_duration_ms, 
learning_max_duration_time_ms);
+    METRIC_VAR_SET(learning_replicas_max_copy_file_bytes, 
learning_max_copy_file_size);
     _counter_cold_backup_running_count->set(cold_backup_running_count);
     
_counter_cold_backup_max_duration_time_ms->set(cold_backup_max_duration_time_ms);
     
_counter_cold_backup_max_upload_file_size->set(cold_backup_max_upload_file_size);
diff --git a/src/replica/replica_stub.h b/src/replica/replica_stub.h
index 7e4d75c60..ebfc166b6 100644
--- a/src/replica/replica_stub.h
+++ b/src/replica/replica_stub.h
@@ -499,20 +499,9 @@ private:
     METRIC_VAR_DECLARE_gauge_int64(opening_replicas);
     METRIC_VAR_DECLARE_gauge_int64(closing_replicas);
 
-    perf_counter_wrapper _counter_replicas_learning_count;
-    perf_counter_wrapper _counter_replicas_learning_max_duration_time_ms;
-    perf_counter_wrapper _counter_replicas_learning_max_copy_file_size;
-    perf_counter_wrapper _counter_replicas_learning_recent_start_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_round_start_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_copy_file_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_copy_file_size;
-    perf_counter_wrapper _counter_replicas_learning_recent_copy_buffer_size;
-    perf_counter_wrapper _counter_replicas_learning_recent_learn_cache_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_learn_app_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_learn_log_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_learn_reset_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_learn_fail_count;
-    perf_counter_wrapper _counter_replicas_learning_recent_learn_succ_count;
+    METRIC_VAR_DECLARE_gauge_int64(learning_replicas);
+    METRIC_VAR_DECLARE_gauge_int64(learning_replicas_max_duration_ms);
+    METRIC_VAR_DECLARE_gauge_int64(learning_replicas_max_copy_file_bytes);
 
     perf_counter_wrapper _counter_replicas_recent_prepare_fail_count;
     perf_counter_wrapper _counter_replicas_recent_replica_move_error_count;
diff --git a/src/utils/metrics.h b/src/utils/metrics.h
index f9ab9c2dc..b5e31c050 100644
--- a/src/utils/metrics.h
+++ b/src/utils/metrics.h
@@ -656,6 +656,7 @@ enum class metric_unit : size_t
     kReplicas,
     kServers,
     kRequests,
+    kResponses,
     kSeeks,
     kPointLookups,
     kValues,
@@ -669,6 +670,9 @@ enum class metric_unit : size_t
     kOperations,
     kTasks,
     kDisconnections,
+    kLearns,
+    kRounds,
+    kResets,
     kInvalidUnit,
 };
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to