This is an automated email from the ASF dual-hosted git repository. wangdan pushed a commit to branch migrate-metrics-dev in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
commit 8bb759f3695eb1a1fe1eb2336f4aeaaf32ffd502 Author: Dan Wang <[email protected]> AuthorDate: Thu Apr 6 11:41:34 2023 +0800 feat(new_metrics): add disk-level metric entity and migrate disk-level metrics for fs_manager (#1427) https://github.com/apache/incubator-pegasus/issues/1425 In perf counters, all metrics of `fs_manager` are server-level: for example, the total capacity and the available capacity of all disks that hold Pegasus data. However, the capacity and the available capacity of each individual disk are sometimes more important, since running out of space on a disk will lead to serious problems. Therefore, after being migrated to the new framework, the server-level metrics of perf counters become disk-level, including the capacity and the available capacity of a disk. Another disk-level metric -- the available percentage of each disk used by a replica server -- can be obtained simply with the division operator (available capacity divided by total capacity). Whenever server-level metrics are needed, just aggregate over the disk-level ones. For example, to compute the other 2 server-level metrics -- the minimal/maximal available percentage among all disks used by a replica server in a node -- just apply the min/max operators over the disk-level ones in Prometheus. To implement disk-level metrics, a disk-level metric entity is also added. 
--- src/common/fs_manager.cpp | 93 +++++++++++++++++++------------ src/common/fs_manager.h | 57 +++++++++++-------- src/common/test/CMakeLists.txt | 1 + src/replica/test/replica_disk_test_base.h | 1 - src/utils/metrics.h | 7 +++ 5 files changed, 98 insertions(+), 61 deletions(-) diff --git a/src/common/fs_manager.cpp b/src/common/fs_manager.cpp index 6fed98132..af77f6814 100644 --- a/src/common/fs_manager.cpp +++ b/src/common/fs_manager.cpp @@ -44,7 +44,6 @@ #include "common/replication_enums.h" #include "fmt/core.h" #include "fmt/ostream.h" -#include "perf_counter/perf_counter.h" #include "replica_admin_types.h" #include "runtime/api_layer1.h" #include "utils/fail_point.h" @@ -53,6 +52,18 @@ #include "utils/ports.h" #include "utils/string_view.h" +METRIC_DEFINE_entity(disk); + +METRIC_DEFINE_gauge_int64(disk, + disk_capacity_total_mb, + dsn::metric_unit::kMegaBytes, + "The total disk capacity"); + +METRIC_DEFINE_gauge_int64(disk, + disk_capacity_avail_mb, + dsn::metric_unit::kMegaBytes, + "The available disk capacity"); + namespace dsn { namespace replication { @@ -83,6 +94,34 @@ error_code disk_status_to_error_code(disk_status::type ds) } } +namespace { + +metric_entity_ptr instantiate_disk_metric_entity(const std::string &tag, + const std::string &data_dir) +{ + auto entity_id = fmt::format("disk_{}", tag); + + return METRIC_ENTITY_disk.instantiate(entity_id, {{"tag", tag}, {"data_dir", data_dir}}); +} + +} // anonymous namespace + +disk_capacity_metrics::disk_capacity_metrics(const std::string &tag, const std::string &data_dir) + : _disk_metric_entity(instantiate_disk_metric_entity(tag, data_dir)), + METRIC_VAR_INIT_disk(disk_capacity_total_mb), + METRIC_VAR_INIT_disk(disk_capacity_avail_mb) +{ +} + +const metric_entity_ptr &disk_capacity_metrics::disk_metric_entity() const +{ + CHECK_NOTNULL(_disk_metric_entity, + "disk metric entity should has been instantiated: " + "uninitialized entity cannot be used to instantiate " + "metric"); + return 
_disk_metric_entity; +} + uint64_t dir_node::replicas_count() const { uint64_t sum = 0; @@ -141,6 +180,9 @@ void dir_node::update_disk_stat() disk_available_ratio = static_cast<int>( disk_capacity_mb == 0 ? 0 : std::round(disk_available_mb * 100.0 / disk_capacity_mb)); + METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_total_mb, disk_capacity_mb); + METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_avail_mb, disk_available_mb); + // It's able to change status from NORMAL to SPACE_INSUFFICIENT, and vice versa. disk_status::type old_status = status; auto new_status = disk_available_ratio < FLAGS_disk_min_available_space_ratio @@ -158,30 +200,6 @@ void dir_node::update_disk_stat() enum_to_string(status)); } -fs_manager::fs_manager() -{ - _counter_total_capacity_mb.init_app_counter("eon.replica_stub", - "disk.capacity.total(MB)", - COUNTER_TYPE_NUMBER, - "total disk capacity in MB"); - _counter_total_available_mb.init_app_counter("eon.replica_stub", - "disk.available.total(MB)", - COUNTER_TYPE_NUMBER, - "total disk available in MB"); - _counter_total_available_ratio.init_app_counter("eon.replica_stub", - "disk.available.total.ratio", - COUNTER_TYPE_NUMBER, - "total disk available ratio"); - _counter_min_available_ratio.init_app_counter("eon.replica_stub", - "disk.available.min.ratio", - COUNTER_TYPE_NUMBER, - "minimal disk available ratio in all disks"); - _counter_max_available_ratio.init_app_counter("eon.replica_stub", - "disk.available.max.ratio", - COUNTER_TYPE_NUMBER, - "maximal disk available ratio in all disks"); -} - dir_node *fs_manager::get_dir_node(const std::string &subdir) const { std::string norm_subdir; @@ -355,8 +373,14 @@ void fs_manager::remove_replica(const gpid &pid) void fs_manager::update_disk_stat() { + _total_capacity_mb = 0; + _total_available_mb = 0; + int total_available_ratio = 0; + int min_available_ratio = 100; + int max_available_ratio = 0; + zauto_write_lock l(_lock); - reset_disk_stat(); + for (auto &dn : _dir_nodes) { // If the 
disk is already in IO_ERROR status, it will not change to other status, just skip // it. @@ -368,10 +392,10 @@ void fs_manager::update_disk_stat() dn->update_disk_stat(); _total_capacity_mb += dn->disk_capacity_mb; _total_available_mb += dn->disk_available_mb; - _min_available_ratio = std::min(dn->disk_available_ratio, _min_available_ratio); - _max_available_ratio = std::max(dn->disk_available_ratio, _max_available_ratio); + min_available_ratio = std::min(dn->disk_available_ratio, min_available_ratio); + max_available_ratio = std::max(dn->disk_available_ratio, max_available_ratio); } - _total_available_ratio = static_cast<int>( + total_available_ratio = static_cast<int>( _total_capacity_mb == 0 ? 0 : std::round(_total_available_mb * 100.0 / _total_capacity_mb)); LOG_INFO("update disk space succeed: disk_count = {}, total_capacity_mb = {}, " @@ -380,14 +404,9 @@ void fs_manager::update_disk_stat() _dir_nodes.size(), _total_capacity_mb, _total_available_mb, - _total_available_ratio, - _min_available_ratio, - _max_available_ratio); - _counter_total_capacity_mb->set(_total_capacity_mb); - _counter_total_available_mb->set(_total_available_mb); - _counter_total_available_ratio->set(_total_available_ratio); - _counter_min_available_ratio->set(_min_available_ratio); - _counter_max_available_ratio->set(_max_available_ratio); + total_available_ratio, + min_available_ratio, + max_available_ratio); } void fs_manager::add_new_dir_node(const std::string &data_dir, const std::string &tag) diff --git a/src/common/fs_manager.h b/src/common/fs_manager.h index 65fb0243a..dd22a6953 100644 --- a/src/common/fs_manager.h +++ b/src/common/fs_manager.h @@ -28,10 +28,12 @@ #include "common/replication_other_types.h" #include "metadata_types.h" -#include "perf_counter/perf_counter_wrapper.h" +#include "utils/autoref_ptr.h" #include "utils/error_code.h" #include "utils/flags.h" #include "utils/string_view.h" +#include "utils/metrics.h" +#include "utils/ports.h" #include "utils/zlocks.h" 
namespace dsn { @@ -44,6 +46,25 @@ DSN_DECLARE_int32(disk_min_available_space_ratio); error_code disk_status_to_error_code(disk_status::type ds); +class disk_capacity_metrics +{ +public: + disk_capacity_metrics(const std::string &tag, const std::string &data_dir); + ~disk_capacity_metrics() = default; + + const metric_entity_ptr &disk_metric_entity() const; + + METRIC_DEFINE_SET_METHOD(disk_capacity_total_mb, int64_t) + METRIC_DEFINE_SET_METHOD(disk_capacity_avail_mb, int64_t) + +private: + const metric_entity_ptr _disk_metric_entity; + METRIC_VAR_DECLARE_gauge_int64(disk_capacity_total_mb); + METRIC_VAR_DECLARE_gauge_int64(disk_capacity_avail_mb); + + DISALLOW_COPY_AND_ASSIGN(disk_capacity_metrics); +}; + struct dir_node { public: @@ -57,6 +78,9 @@ public: std::map<app_id, std::set<gpid>> holding_primary_replicas; std::map<app_id, std::set<gpid>> holding_secondary_replicas; +private: + disk_capacity_metrics disk_capacity; + public: dir_node(const std::string &tag_, const std::string &dir_, @@ -69,7 +93,8 @@ public: disk_capacity_mb(disk_capacity_mb_), disk_available_mb(disk_available_mb_), disk_available_ratio(disk_available_ratio_), - status(status_) + status(status_), + disk_capacity(tag_, dir_) { } // All functions are not thread-safe. However, they are only used in fs_manager @@ -87,7 +112,8 @@ public: class fs_manager { public: - fs_manager(); + fs_manager() = default; + ~fs_manager() = default; // Should be called before open/load any replicas. // NOTE: 'data_dirs' and 'data_dir_tags' must have the same size and in the same order. @@ -135,37 +161,22 @@ public: std::vector<disk_info> get_disk_infos(int app_id) const; private: - void reset_disk_stat() - { - _total_capacity_mb = 0; - _total_available_mb = 0; - _total_available_ratio = 0; - _min_available_ratio = 100; - _max_available_ratio = 0; - } - dir_node *get_dir_node(const std::string &subdir) const; - // when visit the tag/storage of the _dir_nodes map, there's no need to protect by the lock. 
- // but when visit the holding_replicas, you must take care. + // TODO(wangdan): _dir_nodes should be protected by lock since add_new_disk are supported: + // it might be updated arbitrarily at any time. + // + // Especially when visiting the holding_replicas, you must take care. mutable zrwlock_nr _lock; // [ lock + int64_t _total_capacity_mb = 0; int64_t _total_available_mb = 0; - int _total_available_ratio = 0; - int _min_available_ratio = 100; - int _max_available_ratio = 0; // Once dir_node has been added to '_dir_nodes', it will not be removed, it will be marked // as non-NORMAL status if it is not available. std::vector<std::shared_ptr<dir_node>> _dir_nodes; // ] end of lock - perf_counter_wrapper _counter_total_capacity_mb; - perf_counter_wrapper _counter_total_available_mb; - perf_counter_wrapper _counter_total_available_ratio; - perf_counter_wrapper _counter_min_available_ratio; - perf_counter_wrapper _counter_max_available_ratio; - friend class replica_test; friend class replica_stub; friend class mock_replica_stub; diff --git a/src/common/test/CMakeLists.txt b/src/common/test/CMakeLists.txt index 78d94000c..74a9cdf3e 100644 --- a/src/common/test/CMakeLists.txt +++ b/src/common/test/CMakeLists.txt @@ -27,6 +27,7 @@ set(MY_PROJ_NAME dsn_replication_common_test) set(MY_SRC_SEARCH_MODE "GLOB") set(MY_PROJ_LIBS + dsn_http dsn_replication_common dsn_runtime gtest diff --git a/src/replica/test/replica_disk_test_base.h b/src/replica/test/replica_disk_test_base.h index 08e7ffc39..e20883cd7 100644 --- a/src/replica/test/replica_disk_test_base.h +++ b/src/replica/test/replica_disk_test_base.h @@ -64,7 +64,6 @@ public: generate_mock_app_info(); stub->_fs_manager._dir_nodes.clear(); - stub->_fs_manager.reset_disk_stat(); generate_mock_dir_nodes(dir_nodes_count); generate_mock_empty_dir_node(empty_dir_nodes_count); diff --git a/src/utils/metrics.h b/src/utils/metrics.h index e69268006..a230aa1f9 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -165,6 
+165,7 @@ class error_code; _##name(METRIC_##name.instantiate(entity##_metric_entity(), ##__VA_ARGS__)) #define METRIC_VAR_INIT_replica(name, ...) METRIC_VAR_INIT(name, replica, ##__VA_ARGS__) #define METRIC_VAR_INIT_server(name, ...) METRIC_VAR_INIT(name, server, ##__VA_ARGS__) +#define METRIC_VAR_INIT_disk(name, ...) METRIC_VAR_INIT(name, disk, ##__VA_ARGS__) // Perform increment-related operations on metrics including gauge and counter. #define METRIC_VAR_INCREMENT_BY(name, x) \ @@ -194,6 +195,11 @@ class error_code; #define METRIC_VAR_AUTO_LATENCY_DURATION_NS(name) __##name##_auto_latency.duration_ns() +#define METRIC_DEFINE_SET_METHOD(name, value_type) \ + void set_##name(value_type value) { METRIC_VAR_SET(name, value); } + +#define METRIC_CALL_SET_METHOD(obj, name, value) obj.set_##name(value) + namespace dsn { class metric; // IWYU pragma: keep class metric_entity_prototype; // IWYU pragma: keep @@ -614,6 +620,7 @@ enum class metric_unit : size_t kBytes, kMegaBytes, kCapacityUnits, + kPercent, kRequests, kSeeks, kPointLookups, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
