(incubator-pegasus) branch master updated: feat(new_metrics): show server-level resource usage by shell nodes command based on new metrics (#1884)

wangdan Wed, 31 Jan 2024 23:50:50 -0800

This is an automated email from the ASF dual-hosted git repository.

wangdan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git



The following commit(s) were added to refs/heads/master by this push:
     new 40c00dbb0 feat(new_metrics): show server-level resource usage by shell 
nodes command based on new metrics (#1884)
40c00dbb0 is described below

commit 40c00dbb03554fe95d5e120e440c729915f9c8e0
Author: Dan Wang <[email protected]>
AuthorDate: Thu Feb 1 15:50:40 2024 +0800

    feat(new_metrics): show server-level resource usage by shell nodes command 
based on new metrics (#1884)
---
 src/shell/commands/node_management.cpp  | 113 ++++++++++++++++++++++----------
 src/shell/commands/table_management.cpp |   2 +-
 2 files changed, 79 insertions(+), 36 deletions(-)

diff --git a/src/shell/commands/node_management.cpp 
b/src/shell/commands/node_management.cpp
index fd9369b5c..9a30ba88f 100644
--- a/src/shell/commands/node_management.cpp
+++ b/src/shell/commands/node_management.cpp
@@ -35,6 +35,7 @@
 #include "common/json_helper.h"
 #include "common/replication_enums.h"
 #include "dsn.layer2_types.h"
+#include "http/http_status_code.h"
 #include "meta_admin_types.h"
 #include "perf_counter/perf_counter_utils.h"
 #include "runtime/rpc/rpc_address.h"
@@ -45,7 +46,10 @@
 #include "shell/sds/sds.h"
 #include "utils/blob.h"
 #include "utils/error_code.h"
+#include "utils/errors.h"
+#include "utils/metrics.h"
 #include "utils/output_utils.h"
+#include "utils/ports.h"
 #include "utils/strings.h"
 #include "utils/utils.h"
 
@@ -88,6 +92,45 @@ bool query_cluster_info(command_executor *e, shell_context 
*sc, arguments args)
     return true;
 }
 
+namespace {
+
+dsn::metric_filters resource_usage_filters()
+{
+    dsn::metric_filters filters;
+    filters.with_metric_fields = {dsn::kMetricNameField, 
dsn::kMetricSingleValueField};
+    filters.entity_types = {"server"};
+    filters.entity_metrics = {"resident_mem_usage_mb", 
"rdb_block_cache_mem_usage_bytes"};
+    return filters;
+}
+
+dsn::error_s parse_resource_usage(const std::string &json_string, 
list_nodes_helper &stat)
+{
+    dsn::error_s err;
+
+    dsn::metric_query_brief_value_snapshot query_snapshot;
+    dsn::blob bb(json_string.data(), 0, json_string.size());
+    if 
(dsn_unlikely(!dsn::json::json_forwarder<dsn::metric_query_brief_value_snapshot>::decode(
+            bb, query_snapshot))) {
+        return FMT_ERR(dsn::ERR_INVALID_DATA, "invalid json string");
+    }
+
+    for (const auto &entity : query_snapshot.entities) {
+        if (entity.type == "server") {
+            for (const auto &m : entity.metrics) {
+                if (m.name == "resident_mem_usage_mb") {
+                    stat.memused_res_mb += m.value;
+                } else if (m.name == "rdb_block_cache_mem_usage_bytes") {
+                    stat.block_cache_bytes += m.value;
+                }
+            }
+        }
+    }
+
+    return dsn::error_s::ok();
+}
+
+} // anonymous namespace
+
 bool ls_nodes(command_executor *e, shell_context *sc, arguments args)
 {
     static struct option long_options[] = {{"detailed", no_argument, 0, 'd'},
@@ -227,54 +270,54 @@ bool ls_nodes(command_executor *e, shell_context *sc, 
arguments args)
             return true;
         }
 
-        std::vector<std::pair<bool, std::string>> results =
-            call_remote_command(sc,
-                                nodes,
-                                "perf-counters-by-prefix",
-                                {"replica*server*memused.res(MB)",
-                                 
"replica*app.pegasus*rdb.block_cache.memory_usage",
-                                 
"replica*eon.replica_stub*disk.available.total.ratio",
-                                 
"replica*eon.replica_stub*disk.available.min.ratio",
-                                 
"replica*app.pegasus*rdb.memtable.memory_usage",
-                                 
"replica*app.pegasus*rdb.index_and_filter_blocks.memory_usage"});
+        const auto &results = get_metrics(nodes, 
resource_usage_filters().to_query_string());
 
-        for (int i = 0; i < nodes.size(); ++i) {
-            dsn::rpc_address node_addr = nodes[i].address;
-            auto tmp_it = tmp_map.find(node_addr);
-            if (tmp_it == tmp_map.end())
+        // TODO(wangdan): following replica-level and disk-level metrics would 
be replaced:
+        // "replica*eon.replica_stub*disk.available.total.ratio"
+        // "replica*eon.replica_stub*disk.available.min.ratio"
+        // "replica*app.pegasus*rdb.memtable.memory_usage"
+        // "replica*app.pegasus*rdb.index_and_filter_blocks.memory_usage"
+
+        for (size_t i = 0; i < nodes.size(); ++i) {
+            auto tmp_it = tmp_map.find(nodes[i].address);
+            if (tmp_it == tmp_map.end()) {
                 continue;
-            if (!results[i].first) {
-                std::cout << "query perf counter info from node " << 
node_addr.to_string()
-                          << " failed" << std::endl;
+            }
+
+            if (!results[i].error()) {
+                std::cout << "ERROR: send http request to query resource 
metrics from node "
+                          << nodes[i].address << " failed: " << 
results[i].error() << std::endl;
                 return true;
             }
-            dsn::perf_counter_info info;
-            dsn::blob bb(results[i].second.data(), 0, 
results[i].second.size());
-            if (!dsn::json::json_forwarder<dsn::perf_counter_info>::decode(bb, 
info)) {
-                std::cout << "decode perf counter info from node " << 
node_addr.to_string()
-                          << " failed, result = " << results[i].second << 
std::endl;
+            if (results[i].status() != dsn::http_status_code::kOk) {
+                std::cout << "ERROR: send http request to query resource 
metrics from node "
+                          << nodes[i].address
+                          << " failed: " << 
dsn::get_http_status_message(results[i].status())
+                          << std::endl
+                          << results[i].body() << std::endl;
                 return true;
             }
-            if (info.result != "OK") {
-                std::cout << "query perf counter info from node " << 
node_addr.to_string()
-                          << " returns error, error = " << info.result << 
std::endl;
+
+            auto &stat = tmp_it->second;
+            const auto &res = parse_resource_usage(results[i].body(), stat);
+            if (!res) {
+                std::cout << "ERROR: parse sst metrics response from node " << 
nodes[i].address
+                          << " failed: " << res << std::endl;
                 return true;
             }
-            list_nodes_helper &h = tmp_it->second;
+
+            // TODO(wangdan): after migrated to new metrics, remove following 
code:
+            dsn::perf_counter_info info;
             for (dsn::perf_counter_metric &m : info.counters) {
-                if (m.name.find("memused.res(MB)") != std::string::npos)
-                    h.memused_res_mb += m.value;
-                else if (m.name.find("rdb.block_cache.memory_usage") != 
std::string::npos)
-                    h.block_cache_bytes += m.value;
-                else if (m.name.find("disk.available.total.ratio") != 
std::string::npos)
-                    h.disk_available_total_ratio += m.value;
+                if (m.name.find("disk.available.total.ratio") != 
std::string::npos)
+                    stat.disk_available_total_ratio += m.value;
                 else if (m.name.find("disk.available.min.ratio") != 
std::string::npos)
-                    h.disk_available_min_ratio += m.value;
+                    stat.disk_available_min_ratio += m.value;
                 else if (m.name.find("rdb.memtable.memory_usage") != 
std::string::npos)
-                    h.mem_tbl_bytes += m.value;
+                    stat.mem_tbl_bytes += m.value;
                 else if 
(m.name.find("rdb.index_and_filter_blocks.memory_usage") !=
                          std::string::npos)
-                    h.mem_idx_bytes += m.value;
+                    stat.mem_idx_bytes += m.value;
             }
         }
     }
diff --git a/src/shell/commands/table_management.cpp 
b/src/shell/commands/table_management.cpp
index dbb01ac8b..46955e84f 100644
--- a/src/shell/commands/table_management.cpp
+++ b/src/shell/commands/table_management.cpp
@@ -323,7 +323,7 @@ bool app_disk(command_executor *e, shell_context *sc, 
arguments args)
 
     std::map<dsn::rpc_address, std::map<int32_t, double>> disk_map;
     std::map<dsn::rpc_address, std::map<int32_t, double>> count_map;
-    for (int i = 0; i < nodes.size(); ++i) {
+    for (size_t i = 0; i < nodes.size(); ++i) {
         if (!results[i].error()) {
             std::cout << "ERROR: send http request to query sst metrics from 
node "
                       << nodes[i].address << " failed: " << results[i].error() 
<< std::endl;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-pegasus) branch master updated: feat(new_metrics): show server-level resource usage by shell nodes command based on new metrics (#1884)

Reply via email to