This is an automated email from the ASF dual-hosted git repository. wangdan pushed a commit to branch migrate-metrics-dev in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
commit 410043ce53a6ffe0a06f662c81c5b67d23223bef Author: Dan Wang <[email protected]> AuthorDate: Thu May 11 11:32:43 2023 +0800 feat(new_metrics): migrate metrics for replica_stub (part 5) (#1469) https://github.com/apache/incubator-pegasus/issues/1454 This is the 5th part of migrating the metrics of replica_stub to the new framework; most of the metrics in this part are cold-backup-related. During this migration, 10 metrics are changed from server-level to replica-level: the number of currently running backups, the max backup duration among backups, the max size of uploaded files among backups, the number of started/failed/successful/cancelled backups, the number of failed/successful file uploads for backups, and the total size of uploaded files for backups. The metric tracking the number of paused cold backups has been removed, since it is not in use. `tcmalloc_released_bytes` is changed from a gauge to a counter, since a counter is a more accurate reflection of the memory recently released by tcmalloc. 
--- src/meta/meta_backup_service.cpp | 8 +- src/meta/meta_backup_service.h | 4 +- src/replica/backup/cold_backup_context.cpp | 28 +++---- src/replica/backup/replica_backup_manager.cpp | 32 +++++-- src/replica/backup/replica_backup_manager.h | 9 +- src/replica/replica.cpp | 47 ++++++++++- src/replica/replica.h | 19 +++-- src/replica/replica_backup.cpp | 7 +- src/replica/replica_stub.cpp | 116 ++++++-------------------- src/replica/replica_stub.h | 14 +--- src/utils/metrics.h | 2 + 11 files changed, 140 insertions(+), 146 deletions(-) diff --git a/src/meta/meta_backup_service.cpp b/src/meta/meta_backup_service.cpp index cfdddbb53..832158161 100644 --- a/src/meta/meta_backup_service.cpp +++ b/src/meta/meta_backup_service.cpp @@ -55,9 +55,9 @@ METRIC_DEFINE_entity(backup_policy); METRIC_DEFINE_gauge_int64(backup_policy, - policy_recent_backup_duration_ms, + backup_recent_duration_ms, dsn::metric_unit::kMilliSeconds, - "The recent backup duration"); + "The duration of recent backup"); namespace dsn { namespace replication { @@ -78,7 +78,7 @@ metric_entity_ptr instantiate_backup_policy_metric_entity(const std::string &pol backup_policy_metrics::backup_policy_metrics(const std::string &policy_name) : _backup_policy_metric_entity(instantiate_backup_policy_metric_entity(policy_name)), - METRIC_VAR_INIT_backup_policy(policy_recent_backup_duration_ms) + METRIC_VAR_INIT_backup_policy(backup_recent_duration_ms) { } @@ -1039,7 +1039,7 @@ void policy_context::issue_gc_backup_info_task_unlocked() last_backup_duration_time_ms = (_cur_backup.end_time_ms - _cur_backup.start_time_ms); } } - METRIC_SET(*_metrics, policy_recent_backup_duration_ms, last_backup_duration_time_ms); + METRIC_SET(*_metrics, backup_recent_duration_ms, last_backup_duration_time_ms); } void policy_context::sync_remove_backup_info(const backup_info &info, dsn::task_ptr sync_callback) diff --git a/src/meta/meta_backup_service.h b/src/meta/meta_backup_service.h index 021b99552..14afbf7f4 100644 --- 
a/src/meta/meta_backup_service.h +++ b/src/meta/meta_backup_service.h @@ -184,11 +184,11 @@ public: const metric_entity_ptr &backup_policy_metric_entity() const; - METRIC_DEFINE_SET(policy_recent_backup_duration_ms, int64_t) + METRIC_DEFINE_SET(backup_recent_duration_ms, int64_t) private: const metric_entity_ptr _backup_policy_metric_entity; - METRIC_VAR_DECLARE_gauge_int64(policy_recent_backup_duration_ms); + METRIC_VAR_DECLARE_gauge_int64(backup_recent_duration_ms); DISALLOW_COPY_AND_ASSIGN(backup_policy_metrics); }; diff --git a/src/replica/backup/cold_backup_context.cpp b/src/replica/backup/cold_backup_context.cpp index f9063a896..5ff49e862 100644 --- a/src/replica/backup/cold_backup_context.cpp +++ b/src/replica/backup/cold_backup_context.cpp @@ -24,15 +24,13 @@ #include "common/backup_common.h" #include "common/replication.codes.h" -#include "perf_counter/perf_counter.h" -#include "perf_counter/perf_counter_wrapper.h" #include "replica/replica.h" -#include "replica/replica_stub.h" #include "runtime/api_layer1.h" #include "runtime/task/async_calls.h" #include "utils/blob.h" #include "utils/error_code.h" #include "utils/filesystem.h" +#include "utils/metrics.h" #include "utils/utils.h" namespace dsn { @@ -71,7 +69,7 @@ void cold_backup_context::cancel() { _status.store(ColdBackupCanceled); if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_cancel_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_cancelled_count); } } @@ -93,7 +91,7 @@ bool cold_backup_context::fail_check(const char *failure_reason) strncpy(_reason, failure_reason, sizeof(_reason) - 1); _reason[sizeof(_reason) - 1] = '\0'; if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_fail_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_failed_count); } return true; } else { @@ -107,7 +105,7 @@ bool cold_backup_context::complete_check(bool uploaded) if (uploaded) { 
_progress.store(cold_backup_constant::PROGRESS_FINISHED); if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_succ_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_successful_count); } return _status.compare_exchange_strong(checking, ColdBackupCompleted); } else { @@ -132,7 +130,7 @@ bool cold_backup_context::fail_checkpoint(const char *failure_reason) strncpy(_reason, failure_reason, sizeof(_reason) - 1); _reason[sizeof(_reason) - 1] = '\0'; if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_fail_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_failed_count); } return true; } else { @@ -158,7 +156,7 @@ bool cold_backup_context::fail_upload(const char *failure_reason) strncpy(_reason, failure_reason, sizeof(_reason) - 1); _reason[sizeof(_reason) - 1] = '\0'; if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_fail_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_failed_count); } return true; } else { @@ -174,7 +172,7 @@ bool cold_backup_context::complete_upload() _status.compare_exchange_strong(paused, ColdBackupCompleted)) { _progress.store(cold_backup_constant::PROGRESS_FINISHED); if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_succ_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_successful_count); } return true; } else { @@ -720,8 +718,7 @@ void cold_backup_context::upload_file(const std::string &local_filename) fail_upload("create file failed"); } if (resp.err != ERR_OK && _owner_replica != nullptr) { - _owner_replica->get_replica_stub() - ->_counter_cold_backup_recent_upload_file_fail_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_file_upload_failed_count); } release_ref(); return; @@ -783,8 +780,7 @@ void cold_backup_context::on_upload(const dist::block_service::block_file_ptr &f 
fail_upload("upload checkpoint file to remote failed"); } if (resp.err != ERR_OK && _owner_replica != nullptr) { - _owner_replica->get_replica_stub() - ->_counter_cold_backup_recent_upload_file_fail_count->increment(); + METRIC_INCREMENT(*_owner_replica, backup_file_upload_failed_count); } release_ref(); return; @@ -1003,10 +999,8 @@ void cold_backup_context::on_upload_file_complete(const std::string &local_filen _upload_file_size.fetch_add(f_size); file_upload_complete(local_filename); if (_owner_replica != nullptr) { - _owner_replica->get_replica_stub() - ->_counter_cold_backup_recent_upload_file_succ_count->increment(); - _owner_replica->get_replica_stub()->_counter_cold_backup_recent_upload_file_size->add( - f_size); + METRIC_INCREMENT(*_owner_replica, backup_file_upload_successful_count); + METRIC_INCREMENT_BY(*_owner_replica, backup_file_upload_total_bytes, f_size); } // update progress // int a = 10; int b = 3; then b/a = 0; diff --git a/src/replica/backup/replica_backup_manager.cpp b/src/replica/backup/replica_backup_manager.cpp index 0d34afa2d..6ab2af844 100644 --- a/src/replica/backup/replica_backup_manager.cpp +++ b/src/replica/backup/replica_backup_manager.cpp @@ -19,7 +19,6 @@ #include <stdint.h> #include <algorithm> -#include <atomic> #include <chrono> #include <map> #include <memory> @@ -41,9 +40,25 @@ #include "utils/filesystem.h" #include "utils/flags.h" #include "utils/fmt_logging.h" +#include "utils/string_view.h" #include "utils/strings.h" #include "utils/thread_access_checker.h" +METRIC_DEFINE_gauge_int64(replica, + backup_running_count, + dsn::metric_unit::kBackups, + "The number of current running backups"); + +METRIC_DEFINE_gauge_int64(replica, + backup_max_duration_ms, + dsn::metric_unit::kMilliSeconds, + "The max backup duration among backups"); + +METRIC_DEFINE_gauge_int64(replica, + backup_file_upload_max_bytes, + dsn::metric_unit::kBytes, + "The max size of uploaded files among backups"); + namespace dsn { namespace replication { @@ 
-83,7 +98,14 @@ static bool get_policy_checkpoint_dirs(const std::string &dir, return true; } -replica_backup_manager::replica_backup_manager(replica *r) : replica_base(r), _replica(r) {} +replica_backup_manager::replica_backup_manager(replica *r) + : replica_base(r), + _replica(r), + METRIC_VAR_INIT_replica(backup_running_count), + METRIC_VAR_INIT_replica(backup_max_duration_ms), + METRIC_VAR_INIT_replica(backup_file_upload_max_bytes) +{ +} replica_backup_manager::~replica_backup_manager() { @@ -160,9 +182,9 @@ void replica_backup_manager::collect_backup_info() } } - _replica->_cold_backup_running_count.store(cold_backup_running_count); - _replica->_cold_backup_max_duration_time_ms.store(cold_backup_max_duration_time_ms); - _replica->_cold_backup_max_upload_file_size.store(cold_backup_max_upload_file_size); + METRIC_VAR_SET(backup_running_count, cold_backup_running_count); + METRIC_VAR_SET(backup_max_duration_ms, cold_backup_max_duration_time_ms); + METRIC_VAR_SET(backup_file_upload_max_bytes, cold_backup_max_upload_file_size); } void replica_backup_manager::background_clear_backup_checkpoint(const std::string &policy_name) diff --git a/src/replica/backup/replica_backup_manager.h b/src/replica/backup/replica_backup_manager.h index 80c479d16..40d005537 100644 --- a/src/replica/backup/replica_backup_manager.h +++ b/src/replica/backup/replica_backup_manager.h @@ -21,6 +21,7 @@ #include "replica/replica_base.h" #include "runtime/task/task.h" +#include "utils/metrics.h" namespace dsn { class gpid; @@ -40,6 +41,9 @@ public: void start_collect_backup_info(); private: + friend class replica; + friend class replica_backup_manager_test; + void clear_backup_checkpoint(const std::string &policy_name); void send_clear_request_to_secondaries(const gpid &pid, const std::string &policy_name); void background_clear_backup_checkpoint(const std::string &policy_name); @@ -48,8 +52,9 @@ private: replica *_replica; dsn::task_ptr _collect_info_timer; - friend class replica; - friend 
class replica_backup_manager_test; + METRIC_VAR_DECLARE_gauge_int64(backup_running_count); + METRIC_VAR_DECLARE_gauge_int64(backup_max_duration_ms); + METRIC_VAR_DECLARE_gauge_int64(backup_file_upload_max_bytes); }; } // namespace replication diff --git a/src/replica/replica.cpp b/src/replica/replica.cpp index 9774b913b..fecd4c8a3 100644 --- a/src/replica/replica.cpp +++ b/src/replica/replica.cpp @@ -209,6 +209,41 @@ METRIC_DEFINE_counter(replica, dsn::metric_unit::kRequests, "The number of write requests whose size exceeds threshold"); +METRIC_DEFINE_counter(replica, + backup_started_count, + dsn::metric_unit::kBackups, + "The number of started backups"); + +METRIC_DEFINE_counter(replica, + backup_failed_count, + dsn::metric_unit::kBackups, + "The number of failed backups"); + +METRIC_DEFINE_counter(replica, + backup_successful_count, + dsn::metric_unit::kBackups, + "The number of successful backups"); + +METRIC_DEFINE_counter(replica, + backup_cancelled_count, + dsn::metric_unit::kBackups, + "The number of cancelled backups"); + +METRIC_DEFINE_counter(replica, + backup_file_upload_failed_count, + dsn::metric_unit::kFileUploads, + "The number of failed file uploads for backups"); + +METRIC_DEFINE_counter(replica, + backup_file_upload_successful_count, + dsn::metric_unit::kFileUploads, + "The number of successful file uploads for backups"); + +METRIC_DEFINE_counter(replica, + backup_file_upload_total_bytes, + dsn::metric_unit::kBytes, + "The total size of uploaded files for backups"); + namespace dsn { namespace replication { @@ -250,9 +285,6 @@ replica::replica(replica_stub *stub, _app_info(app), _primary_states(gpid, FLAGS_staleness_for_commit, FLAGS_batch_write_disabled), _potential_secondary_states(this), - _cold_backup_running_count(0), - _cold_backup_max_duration_time_ms(0), - _cold_backup_max_upload_file_size(0), _chkpt_total_size(0), _cur_download_size(0), _restore_progress(0), @@ -288,7 +320,14 @@ replica::replica(replica_stub *stub, 
METRIC_VAR_INIT_replica(prepare_failed_requests), METRIC_VAR_INIT_replica(group_check_failed_requests), METRIC_VAR_INIT_replica(emergency_checkpoints), - METRIC_VAR_INIT_replica(write_size_exceed_threshold_requests) + METRIC_VAR_INIT_replica(write_size_exceed_threshold_requests), + METRIC_VAR_INIT_replica(backup_started_count), + METRIC_VAR_INIT_replica(backup_failed_count), + METRIC_VAR_INIT_replica(backup_successful_count), + METRIC_VAR_INIT_replica(backup_cancelled_count), + METRIC_VAR_INIT_replica(backup_file_upload_failed_count), + METRIC_VAR_INIT_replica(backup_file_upload_successful_count), + METRIC_VAR_INIT_replica(backup_file_upload_total_bytes) { CHECK(!_app_info.app_type.empty(), ""); CHECK_NOTNULL(stub, ""); diff --git a/src/replica/replica.h b/src/replica/replica.h index 56e993cd8..afd58290c 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -294,6 +294,12 @@ public: METRIC_DEFINE_VALUE(write_size_exceed_threshold_requests, int64_t) void METRIC_FUNC_NAME_SET(dup_pending_mutations)(); + METRIC_DEFINE_INCREMENT(backup_failed_count) + METRIC_DEFINE_INCREMENT(backup_successful_count) + METRIC_DEFINE_INCREMENT(backup_cancelled_count) + METRIC_DEFINE_INCREMENT(backup_file_upload_failed_count) + METRIC_DEFINE_INCREMENT(backup_file_upload_successful_count) + METRIC_DEFINE_INCREMENT_BY(backup_file_upload_total_bytes) static const std::string kAppInfo; @@ -607,11 +613,6 @@ private: std::map<std::string, cold_backup_context_ptr> _cold_backup_contexts; partition_split_context _split_states; - // timer task that running in replication-thread - std::atomic<uint64_t> _cold_backup_running_count; - std::atomic<uint64_t> _cold_backup_max_duration_time_ms; - std::atomic<uint64_t> _cold_backup_max_upload_file_size; - // record the progress of restore int64_t _chkpt_total_size; std::atomic<int64_t> _cur_download_size; @@ -691,6 +692,14 @@ private: METRIC_VAR_DECLARE_counter(write_size_exceed_threshold_requests); + 
METRIC_VAR_DECLARE_counter(backup_started_count); + METRIC_VAR_DECLARE_counter(backup_failed_count); + METRIC_VAR_DECLARE_counter(backup_successful_count); + METRIC_VAR_DECLARE_counter(backup_cancelled_count); + METRIC_VAR_DECLARE_counter(backup_file_upload_failed_count); + METRIC_VAR_DECLARE_counter(backup_file_upload_successful_count); + METRIC_VAR_DECLARE_counter(backup_file_upload_total_bytes); + dsn::task_tracker _tracker; // the thread access checker dsn::thread_access_checker _checker; diff --git a/src/replica/replica_backup.cpp b/src/replica/replica_backup.cpp index 50a4b45f2..09c407929 100644 --- a/src/replica/replica_backup.cpp +++ b/src/replica/replica_backup.cpp @@ -42,8 +42,6 @@ #include "common/replication_other_types.h" #include "dsn.layer2_types.h" #include "metadata_types.h" -#include "perf_counter/perf_counter.h" -#include "perf_counter/perf_counter_wrapper.h" #include "replica.h" #include "replica/replica_context.h" #include "replica/replication_app_base.h" @@ -55,6 +53,7 @@ #include "utils/filesystem.h" #include "utils/flags.h" #include "utils/fmt_logging.h" +#include "utils/metrics.h" #include "utils/strings.h" #include "utils/thread_access_checker.h" #include "utils/time_utils.h" @@ -175,7 +174,7 @@ void replica::on_cold_backup(const backup_request &request, /*out*/ backup_respo backup_context->start_check(); backup_context->complete_check(false); if (backup_context->start_checkpoint()) { - _stub->_counter_cold_backup_recent_start_count->increment(); + METRIC_VAR_INCREMENT(backup_started_count); tasking::enqueue( LPC_BACKGROUND_COLD_BACKUP, &_tracker, [this, backup_context]() { generate_backup_checkpoint(backup_context); @@ -196,7 +195,7 @@ void replica::on_cold_backup(const backup_request &request, /*out*/ backup_respo backup_context->progress()); response.err = ERR_BUSY; } else if (backup_status == ColdBackupInvalid && backup_context->start_check()) { - _stub->_counter_cold_backup_recent_start_count->increment(); + 
METRIC_VAR_INCREMENT(backup_started_count); LOG_INFO("{}: start checking backup on remote, response ERR_BUSY", backup_context->name); tasking::enqueue(LPC_BACKGROUND_COLD_BACKUP, nullptr, [backup_context]() { diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index dd10102f6..bb7f05470 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -164,10 +164,10 @@ METRIC_DEFINE_gauge_int64(server, "The number of origin replica dirs (*.ori) for disk migration"); #ifdef DSN_ENABLE_GPERF -METRIC_DEFINE_gauge_int64(server, - tcmalloc_released_bytes, - dsn::metric_unit::kBytes, - "The memory bytes that are released by tcmalloc recently"); +METRIC_DEFINE_counter(server, + tcmalloc_released_bytes, + dsn::metric_unit::kBytes, + "The memory bytes that are released accumulatively by tcmalloc"); #endif METRIC_DEFINE_counter(server, @@ -333,63 +333,6 @@ replica_stub::~replica_stub(void) { close(); } void replica_stub::install_perf_counters() { - // <- Cold Backup Metrics -> - - _counter_cold_backup_running_count.init_app_counter("eon.replica_stub", - "cold.backup.running.count", - COUNTER_TYPE_NUMBER, - "current cold backup count"); - _counter_cold_backup_recent_start_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.start.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup start count in the recent period"); - _counter_cold_backup_recent_succ_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.succ.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup succeed count in the recent period"); - _counter_cold_backup_recent_fail_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.fail.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup fail count in the recent period"); - _counter_cold_backup_recent_cancel_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.cancel.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup cancel count in the 
recent period"); - _counter_cold_backup_recent_pause_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.pause.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup pause count in the recent period"); - _counter_cold_backup_recent_upload_file_succ_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.upload.file.succ.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup upload file succeed count in the recent period"); - _counter_cold_backup_recent_upload_file_fail_count.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.upload.file.fail.count", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup upload file failed count in the recent period"); - _counter_cold_backup_recent_upload_file_size.init_app_counter( - "eon.replica_stub", - "cold.backup.recent.upload.file.size", - COUNTER_TYPE_VOLATILE_NUMBER, - "current cold backup upload file size in the recent perriod"); - _counter_cold_backup_max_duration_time_ms.init_app_counter( - "eon.replica_stub", - "cold.backup.max.duration.time.ms", - COUNTER_TYPE_NUMBER, - "current cold backup max duration time"); - _counter_cold_backup_max_upload_file_size.init_app_counter( - "eon.replica_stub", - "cold.backup.max.upload.file.size", - COUNTER_TYPE_NUMBER, - "current cold backup max upload file size"); - // <- Bulk Load Metrics -> _counter_bulk_load_running_count.init_app_counter("eon.replica_stub", @@ -1827,9 +1770,6 @@ void replica_stub::on_gc() uint64_t learning_count = 0; uint64_t learning_max_duration_time_ms = 0; uint64_t learning_max_copy_file_size = 0; - uint64_t cold_backup_running_count = 0; - uint64_t cold_backup_max_duration_time_ms = 0; - uint64_t cold_backup_max_upload_file_size = 0; uint64_t bulk_load_running_count = 0; uint64_t bulk_load_max_ingestion_time_ms = 0; uint64_t bulk_load_max_duration_time_ms = 0; @@ -1849,12 +1789,6 @@ void replica_stub::on_gc() } if (rep->status() == partition_status::PS_PRIMARY || rep->status() == 
partition_status::PS_SECONDARY) { - cold_backup_running_count += rep->_cold_backup_running_count.load(); - cold_backup_max_duration_time_ms = std::max( - cold_backup_max_duration_time_ms, rep->_cold_backup_max_duration_time_ms.load()); - cold_backup_max_upload_file_size = std::max( - cold_backup_max_upload_file_size, rep->_cold_backup_max_upload_file_size.load()); - if (rep->get_bulk_loader()->get_bulk_load_status() != bulk_load_status::BLS_INVALID) { bulk_load_running_count++; bulk_load_max_ingestion_time_ms = @@ -1878,9 +1812,6 @@ void replica_stub::on_gc() METRIC_VAR_SET(learning_replicas, learning_count); METRIC_VAR_SET(learning_replicas_max_duration_ms, learning_max_duration_time_ms); METRIC_VAR_SET(learning_replicas_max_copy_file_bytes, learning_max_copy_file_size); - _counter_cold_backup_running_count->set(cold_backup_running_count); - _counter_cold_backup_max_duration_time_ms->set(cold_backup_max_duration_time_ms); - _counter_cold_backup_max_upload_file_size->set(cold_backup_max_upload_file_size); _counter_bulk_load_running_count->set(bulk_load_running_count); _counter_bulk_load_max_ingestion_time_ms->set(bulk_load_max_ingestion_time_ms); _counter_bulk_load_max_duration_time_ms->set(bulk_load_max_duration_time_ms); @@ -2797,43 +2728,48 @@ static int64_t get_tcmalloc_numeric_property(const char *prop) uint64_t replica_stub::gc_tcmalloc_memory(bool release_all) { - auto tcmalloc_released_bytes = 0; if (!_release_tcmalloc_memory) { _is_releasing_memory.store(false); - METRIC_VAR_SET(tcmalloc_released_bytes, tcmalloc_released_bytes); - return tcmalloc_released_bytes; + return 0; } if (_is_releasing_memory.load()) { LOG_WARNING("This node is releasing memory..."); - return tcmalloc_released_bytes; + return 0; } _is_releasing_memory.store(true); + int64_t total_allocated_bytes = get_tcmalloc_numeric_property("generic.current_allocated_bytes"); int64_t reserved_bytes = get_tcmalloc_numeric_property("tcmalloc.pageheap_free_bytes"); if (total_allocated_bytes == -1 
|| reserved_bytes == -1) { - return tcmalloc_released_bytes; + return 0; } int64_t max_reserved_bytes = release_all ? 0 : (total_allocated_bytes * _mem_release_max_reserved_mem_percentage / 100.0); - if (reserved_bytes > max_reserved_bytes) { - int64_t release_bytes = reserved_bytes - max_reserved_bytes; - tcmalloc_released_bytes = release_bytes; - LOG_INFO("Memory release started, almost {} bytes will be released", release_bytes); - while (release_bytes > 0) { - // tcmalloc releasing memory will lock page heap, release 1MB at a time to avoid locking - // page heap for long time - ::MallocExtension::instance()->ReleaseToSystem(1024 * 1024); - release_bytes -= 1024 * 1024; - } + if (reserved_bytes <= max_reserved_bytes) { + return 0; + } + + const int64_t expected_released_bytes = reserved_bytes - max_reserved_bytes; + LOG_INFO("Memory release started, almost {} bytes will be released", expected_released_bytes); + + int64_t unreleased_bytes = expected_released_bytes; + while (unreleased_bytes > 0) { + // tcmalloc releasing memory will lock page heap, release 1MB at a time to avoid locking + // page heap for long time + static const int64_t kReleasedBytesEachTime = 1024 * 1024; + ::MallocExtension::instance()->ReleaseToSystem(kReleasedBytesEachTime); + unreleased_bytes -= kReleasedBytesEachTime; } - METRIC_VAR_SET(tcmalloc_released_bytes, tcmalloc_released_bytes); + METRIC_VAR_INCREMENT_BY(tcmalloc_released_bytes, expected_released_bytes); + _is_releasing_memory.store(false); - return tcmalloc_released_bytes; + + return expected_released_bytes; } #endif diff --git a/src/replica/replica_stub.h b/src/replica/replica_stub.h index 4cc961e38..5f39d49b0 100644 --- a/src/replica/replica_stub.h +++ b/src/replica/replica_stub.h @@ -512,7 +512,7 @@ private: METRIC_VAR_DECLARE_gauge_int64(replica_origin_dirs); #ifdef DSN_ENABLE_GPERF - METRIC_VAR_DECLARE_gauge_int64(tcmalloc_released_bytes); + METRIC_VAR_DECLARE_counter(tcmalloc_released_bytes); #endif 
METRIC_VAR_DECLARE_counter(read_failed_requests); @@ -520,18 +520,6 @@ private: METRIC_VAR_DECLARE_counter(read_busy_requests); METRIC_VAR_DECLARE_counter(write_busy_requests); - perf_counter_wrapper _counter_cold_backup_running_count; - perf_counter_wrapper _counter_cold_backup_recent_start_count; - perf_counter_wrapper _counter_cold_backup_recent_succ_count; - perf_counter_wrapper _counter_cold_backup_recent_fail_count; - perf_counter_wrapper _counter_cold_backup_recent_cancel_count; - perf_counter_wrapper _counter_cold_backup_recent_pause_count; - perf_counter_wrapper _counter_cold_backup_recent_upload_file_succ_count; - perf_counter_wrapper _counter_cold_backup_recent_upload_file_fail_count; - perf_counter_wrapper _counter_cold_backup_recent_upload_file_size; - perf_counter_wrapper _counter_cold_backup_max_duration_time_ms; - perf_counter_wrapper _counter_cold_backup_max_upload_file_size; - // <- Bulk load Metrics -> perf_counter_wrapper _counter_bulk_load_running_count; perf_counter_wrapper _counter_bulk_load_downloading_count; diff --git a/src/utils/metrics.h b/src/utils/metrics.h index e90d5d9af..78c067ea3 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -684,6 +684,8 @@ enum class metric_unit : size_t kLearns, kRounds, kResets, + kBackups, + kFileUploads, kInvalidUnit, }; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
