This is an automated email from the ASF dual-hosted git repository.
wangdan pushed a commit to branch migrate-metrics-dev
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
The following commit(s) were added to refs/heads/migrate-metrics-dev by this
push:
new 9c6678515 feat(new_metrics): migrate metrics for latency tracer (#1537)
9c6678515 is described below
commit 9c6678515b6c1f8f45411bd790e95e5f2388ed05
Author: Dan Wang <[email protected]>
AuthorDate: Wed Jun 21 11:27:43 2023 +0800
feat(new_metrics): migrate metrics for latency tracer (#1537)
https://github.com/apache/incubator-pegasus/issues/1536
The only metric of latency tracer, namely the duration between two points
(stages), is migrated to the new framework:
- latency tracer entity is introduced as the new metric entity to which the
metric is attached;
- create a new class that binds each entity and manages the metric;
- create a new class that manages all entity instances, with a read-write lock
that helps improve the performance of searching for the target entity;
- temporarily work around the "No space left on device" problem for Build ASAN.
---
.github/workflows/lint_and_test_cpp.yaml | 86 ++++++++++++++-
src/common/fs_manager.cpp | 2 +-
src/meta/meta_backup_service.cpp | 2 +-
src/meta/table_metrics.cpp | 4 +-
src/replica/replica_base.cpp | 2 +-
src/runtime/profiler.cpp | 2 +-
src/runtime/task/task_queue.cpp | 2 +-
src/utils/latency_tracer.cpp | 178 +++++++++++++++++++++++--------
src/utils/latency_tracer.h | 3 -
src/utils/metrics.h | 2 +
10 files changed, 223 insertions(+), 60 deletions(-)
diff --git a/.github/workflows/lint_and_test_cpp.yaml
b/.github/workflows/lint_and_test_cpp.yaml
index 34e8da5b4..143f848e9 100644
--- a/.github/workflows/lint_and_test_cpp.yaml
+++ b/.github/workflows/lint_and_test_cpp.yaml
@@ -127,6 +127,7 @@ jobs:
- name: Unpack prebuilt third-parties
if: steps.changes.outputs.thirdparty == 'false'
run: |
+ df -h
unzip /root/thirdparties-bin.zip -d ./thirdparty
rm -f /root/thirdparties-bin.zip
- name: Rebuild third-parties
@@ -134,6 +135,7 @@ jobs:
working-directory: thirdparty
# Build thirdparties and leave some necessary libraries and source
run: |
+ df -h
mkdir build
cmake -DCMAKE_BUILD_TYPE=Release -DROCKSDB_PORTABLE=ON -B build/
cmake --build build/ -j $(nproc)
@@ -142,25 +144,33 @@ jobs:
../scripts/download_zk.sh zookeeper-bin
- name: Compilation
run: |
+ df -h
ccache -p
ccache -z
./run.sh build --test --skip_thirdparty -j $(nproc) -t release
ccache -s
+ - name: Clear Build Files
+ run: |
+ df -h
+ find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
- name: Pack Server
run: |
+ df -h
./run.sh pack_server
rm -rf pegasus-server-*
- name: Pack Tools
run: |
+ df -h
./run.sh pack_tools
rm -rf pegasus-tools-*
- name: Tar files
run: |
+ df -h
mv thirdparty/hadoop-bin ./
mv thirdparty/zookeeper-bin ./
rm -rf thirdparty
- find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
tar -zcvhf release__builder.tar build/latest/output build/latest/bin
build/latest/src/server/test/config.ini hadoop-bin zookeeper-bin
+ df -h
- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
@@ -253,6 +263,38 @@ jobs:
path: |
/github/home/.ccache
key: asan_ccache
+ - name: Free Disk Space (Ubuntu)
+ run: |
+ df -h
+ echo "Listing 100 largest packages"
+ dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n
100
+ echo "Removing large packages"
+ apt-get -s autoremove
+ apt-get remove -y openjdk-11-jre-headless
+ df -h
+ echo "Listing directories"
+ du -csh /__w/*/*
+ du -csh /__t/*/*
+ du -csh /opt/*
+ du -csh /usr/local/*
+ du -csh /usr/local/lib/*
+ du -csh /usr/local/share/*
+ du -csh /usr/share/*
+ echo "AGENT_TOOLSDIRECTORY is $AGENT_TOOLSDIRECTORY"
+ echo "Removing large directories"
+ rm -rf /__t/CodeQL
+ rm -rf /__t/node
+ rm -rf /__t/PyPy
+ rm -rf /opt/ghc
+ rm -rf /usr/local/.ghcup
+ rm -rf /usr/local/graalvm
+ rm -rf /usr/local/lib/android
+ rm -rf /usr/local/lib/node_modules
+ rm -rf /usr/local/share/boost
+ rm -rf /usr/local/share/chromium
+ rm -rf /usr/local/share/powershell
+ rm -rf /usr/share/dotnet
+ df -h
- uses: dorny/paths-filter@v2
id: changes
with:
@@ -265,32 +307,53 @@ jobs:
- name: Unpack prebuilt third-parties
if: steps.changes.outputs.thirdparty == 'false'
run: |
+ # TODO(wangdan): clear unneeded files here temporarily. Later,
unneeded files could be cleared in images.
+ df -h
+ rm -f /root/thirdparties-src.zip
unzip /root/thirdparties-bin.zip -d ./thirdparty
rm -f /root/thirdparties-bin.zip
+ # TODO(wangdan): clear unneeded files here temporarily. Later,
unneeded files could be cleared in images.
+ find ./thirdparty -name '*CMakeFiles*' -type d -exec rm -rf "{}" +
+ rm -rf ./thirdparty/hadoop-bin/share/doc
+ rm -rf ./thirdparty/zookeeper-bin/docs
- name: Rebuild third-parties
if: steps.changes.outputs.thirdparty == 'true'
working-directory: thirdparty
# Build thirdparties and leave some necessary libraries and source
run: |
+ # TODO(wangdan): clear unneeded files here temporarily. Later,
unneeded files could be cleared in images.
+ df -h
+ rm -f /root/thirdparties-src.zip
mkdir build
cmake -DCMAKE_BUILD_TYPE=Release -DROCKSDB_PORTABLE=ON -B build/
cmake --build build/ -j $(nproc)
rm -rf build/Build build/Download/[a-y]* build/Source/[a-g]*
build/Source/[i-q]* build/Source/[s-z]*
+ # TODO(wangdan): clear unneeded files here temporarily. Later,
unneeded files could be cleared in images.
+ find ./ -name '*CMakeFiles*' -type d -exec rm -rf "{}" +
../scripts/download_hadoop.sh hadoop-bin
../scripts/download_zk.sh zookeeper-bin
+ # TODO(wangdan): clear unneeded files here temporarily. Later,
unneeded files could be cleared in images.
+ rm -rf hadoop-bin/share/doc
+ rm -rf zookeeper-bin/docs
- name: Compilation
run: |
+ df -h
ccache -p
ccache -z
./run.sh build --test --sanitizer address --skip_thirdparty
--disable_gperf -j $(nproc)
ccache -s
+ - name: Clear Build Files
+ run: |
+ df -h
+ find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
- name: Tar files
run: |
+ df -h
mv thirdparty/hadoop-bin ./
mv thirdparty/zookeeper-bin ./
rm -rf thirdparty
- find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
tar -zcvhf release_address_builder.tar build/latest/output
build/latest/bin build/latest/src/server/test/config.ini hadoop-bin
zookeeper-bin
+ df -h
- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
@@ -419,12 +482,15 @@ jobs:
# ccache -z
# ./run.sh build --test --sanitizer undefined --skip_thirdparty
--disable_gperf -j $(nproc)
# ccache -s
+# - name: Clear Build Files
+# run: |
+# df -h
+# find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
# - name: Tar files
# run: |
# mv thirdparty/hadoop-bin ./
# mv thirdparty/zookeeper-bin ./
# rm -rf thirdparty
-# find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
# tar -zcvhf release_undefined_builder.tar build/latest/output
build/latest/bin build/latest/src/server/test/config.ini hadoop-bin
zookeeper-bin
# - name: Upload Artifact
# uses: actions/upload-artifact@v3
@@ -528,6 +594,7 @@ jobs:
- name: Unpack prebuilt third-parties
if: steps.changes.outputs.thirdparty == 'false'
run: |
+ df -h
unzip /root/thirdparties-bin.zip -d ./thirdparty
rm -f /root/thirdparties-bin.zip
- name: Rebuild third-parties
@@ -535,6 +602,7 @@ jobs:
working-directory: thirdparty
# Build thirdparties and leave some necessary libraries and source
run: |
+ df -h
mkdir build
cmake -DCMAKE_BUILD_TYPE=Release -DROCKSDB_PORTABLE=ON
-DUSE_JEMALLOC=ON -B build/
cmake --build build/ -j $(nproc)
@@ -543,25 +611,33 @@ jobs:
../scripts/download_zk.sh zookeeper-bin
- name: Compilation
run: |
+ df -h
ccache -p
ccache -z
./run.sh build --test --skip_thirdparty -j $(nproc) -t release
--use_jemalloc
ccache -s
+ - name: Clear Build Files
+ run: |
+ df -h
+ find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
- name: Pack Server
run: |
+ df -h
./run.sh pack_server -j
rm -rf pegasus-server-*
- name: Pack Tools
- run:
+ run: |
+ df -h
./run.sh pack_tools -j
rm -rf pegasus-tools-*
- name: Tar files
run: |
+ df -h
mv thirdparty/hadoop-bin ./
mv thirdparty/zookeeper-bin ./
rm -rf thirdparty
- find ./build/latest/src/ -name '*CMakeFiles*' -type d -exec rm -rf
"{}" +
tar -zcvhf release_jemalloc_builder.tar build/latest/output
build/latest/bin build/latest/src/server/test/config.ini hadoop-bin
zookeeper-bin
+ df -h
- name: Upload Artifact
uses: actions/upload-artifact@v3
with:
diff --git a/src/common/fs_manager.cpp b/src/common/fs_manager.cpp
index 2564737bb..880d8a0d6 100644
--- a/src/common/fs_manager.cpp
+++ b/src/common/fs_manager.cpp
@@ -85,7 +85,7 @@ namespace {
metric_entity_ptr instantiate_disk_metric_entity(const std::string &tag,
const std::string &data_dir)
{
- auto entity_id = fmt::format("disk_{}", tag);
+ auto entity_id = fmt::format("disk@{}", tag);
return METRIC_ENTITY_disk.instantiate(entity_id, {{"tag", tag},
{"data_dir", data_dir}});
}
diff --git a/src/meta/meta_backup_service.cpp b/src/meta/meta_backup_service.cpp
index a746c7bdb..b7e443efe 100644
--- a/src/meta/meta_backup_service.cpp
+++ b/src/meta/meta_backup_service.cpp
@@ -69,7 +69,7 @@ namespace {
metric_entity_ptr instantiate_backup_policy_metric_entity(const std::string
&policy_name)
{
- auto entity_id = fmt::format("backup_policy_{}", policy_name);
+ auto entity_id = fmt::format("backup_policy@{}", policy_name);
return METRIC_ENTITY_backup_policy.instantiate(entity_id, {{"policy_name",
policy_name}});
}
diff --git a/src/meta/table_metrics.cpp b/src/meta/table_metrics.cpp
index 73b45d69e..6801c97fc 100644
--- a/src/meta/table_metrics.cpp
+++ b/src/meta/table_metrics.cpp
@@ -112,7 +112,7 @@ namespace {
metric_entity_ptr instantiate_partition_metric_entity(int32_t table_id,
int32_t partition_id)
{
- auto entity_id = fmt::format("partition_{}", gpid(table_id, partition_id));
+ auto entity_id = fmt::format("partition@{}", gpid(table_id, partition_id));
return METRIC_ENTITY_partition.instantiate(
entity_id,
@@ -121,7 +121,7 @@ metric_entity_ptr
instantiate_partition_metric_entity(int32_t table_id, int32_t
metric_entity_ptr instantiate_table_metric_entity(int32_t table_id)
{
- auto entity_id = fmt::format("table_{}", table_id);
+ auto entity_id = fmt::format("table@{}", table_id);
return METRIC_ENTITY_table.instantiate(entity_id, {{"table_id",
std::to_string(table_id)}});
}
diff --git a/src/replica/replica_base.cpp b/src/replica/replica_base.cpp
index 3168ad651..306591119 100644
--- a/src/replica/replica_base.cpp
+++ b/src/replica/replica_base.cpp
@@ -30,7 +30,7 @@ namespace {
metric_entity_ptr instantiate_replica_metric_entity(const gpid &id)
{
- auto entity_id = fmt::format("replica_{}", id);
+ auto entity_id = fmt::format("replica@{}", id);
// Do NOT add `replica_base._app_name` as the table name to the attributes
of entity, since
// it is read-only and will never be updated even if the table is renamed.
diff --git a/src/runtime/profiler.cpp b/src/runtime/profiler.cpp
index 1a6a2bb7b..ebedb6536 100644
--- a/src/runtime/profiler.cpp
+++ b/src/runtime/profiler.cpp
@@ -361,7 +361,7 @@ namespace {
metric_entity_ptr instantiate_profiler_metric_entity(const std::string
&task_name)
{
- auto entity_id = fmt::format("task_{}", task_name);
+ auto entity_id = fmt::format("task@{}", task_name);
return METRIC_ENTITY_profiler.instantiate(entity_id, {{"task_name",
task_name}});
}
diff --git a/src/runtime/task/task_queue.cpp b/src/runtime/task/task_queue.cpp
index 425880909..f7bcb641d 100644
--- a/src/runtime/task/task_queue.cpp
+++ b/src/runtime/task/task_queue.cpp
@@ -63,7 +63,7 @@ namespace {
metric_entity_ptr instantiate_queue_metric_entity(const std::string
&queue_name)
{
- auto entity_id = fmt::format("queue_{}", queue_name);
+ auto entity_id = fmt::format("queue@{}", queue_name);
return METRIC_ENTITY_queue.instantiate(entity_id, {{"queue_name",
queue_name}});
}
diff --git a/src/utils/latency_tracer.cpp b/src/utils/latency_tracer.cpp
index 3d5e6fe48..f60d4b004 100644
--- a/src/utils/latency_tracer.cpp
+++ b/src/utils/latency_tracer.cpp
@@ -20,15 +20,23 @@
#include <fmt/core.h>
#include <cstdint>
#include <iterator>
+#include <set>
#include <utility>
-#include "perf_counter/perf_counter.h"
-#include "perf_counter/perf_counters.h"
#include "runtime/api_layer1.h"
#include "utils/autoref_ptr.h"
#include "utils/config_api.h"
#include "utils/flags.h"
#include "utils/fmt_logging.h"
+#include "utils/metrics.h"
+#include "utils/string_view.h"
+
+METRIC_DEFINE_entity(latency_tracer);
+
+METRIC_DEFINE_percentile_int64(latency_tracer,
+ latency_tracer_duration_ns,
+ dsn::metric_unit::kNanoSeconds,
+ "The duration between two points(stages)");
namespace dsn {
namespace utils {
@@ -42,48 +50,131 @@ DSN_TAG_VARIABLE(enable_latency_tracer, FT_MUTABLE);
DSN_DEFINE_bool(replication,
enable_latency_tracer_report,
false,
- "whether open the latency tracer report perf counter");
+ "whether open the latency tracer report for metrics");
DSN_TAG_VARIABLE(enable_latency_tracer_report, FT_MUTABLE);
-DSN_DEFINE_string(replication,
- latency_tracer_counter_name_prefix,
- "trace_latency",
- "perf counter common name prefix");
+namespace {
-utils::rw_lock_nr counter_lock; //{
-std::unordered_map<std::string, perf_counter_ptr> counters_trace_latency;
-// }
+#define LATENCY_TRACER_METRIC_ENTITY_ID(description, starting_point,
end_point) \
+ fmt::format("latency_tracer@{}|{}|{}", description, starting_point,
end_point)
-utils::rw_lock_nr task_code_lock; //{
-std::unordered_map<std::string, bool> task_codes;
-// }
+metric_entity_ptr instantiate_latency_tracer_metric_entity(const std::string
&description,
+ const std::string
&starting_point,
+ const std::string
&end_point)
+{
+ auto entity_id = LATENCY_TRACER_METRIC_ENTITY_ID(description,
starting_point, end_point);
+
+ return METRIC_ENTITY_latency_tracer.instantiate(entity_id,
+ {{"description",
description},
+ {"starting_point",
starting_point},
+ {"end_point",
end_point}});
+}
-perf_counter_ptr get_trace_counter(const std::string &name)
+// Maintain each latency-tracer-level metric entity, and all metrics attached
to it.
+class latency_tracer_metrics
{
- {
- utils::auto_read_lock read(counter_lock);
- auto iter = counters_trace_latency.find(name);
- if (iter != counters_trace_latency.end()) {
- return iter->second;
- }
- }
+public:
+ latency_tracer_metrics(const std::string &description,
+ const std::string &starting_point,
+ const std::string &end_point);
+ ~latency_tracer_metrics() = default;
+
+ const metric_entity_ptr &latency_tracer_metric_entity() const;
+
+ METRIC_DEFINE_SET(latency_tracer_duration_ns, int64_t)
+
+private:
+ const std::string _description;
+ const std::string _starting_point;
+ const std::string _end_point;
+
+ const dsn::metric_entity_ptr _latency_tracer_metric_entity;
+ METRIC_VAR_DECLARE_percentile_int64(latency_tracer_duration_ns);
+
+ DISALLOW_COPY_AND_ASSIGN(latency_tracer_metrics);
+};
+
+latency_tracer_metrics::latency_tracer_metrics(const std::string &description,
+ const std::string
&starting_point,
+ const std::string &end_point)
+ : _description(description),
+ _starting_point(starting_point),
+ _end_point(end_point),
+ _latency_tracer_metric_entity(
+ instantiate_latency_tracer_metric_entity(description,
starting_point, end_point)),
+ METRIC_VAR_INIT_latency_tracer(latency_tracer_duration_ns)
+{
+}
- utils::auto_write_lock write(counter_lock);
- auto iter = counters_trace_latency.find(name);
- if (iter != counters_trace_latency.end()) {
- return iter->second;
+const dsn::metric_entity_ptr
&latency_tracer_metrics::latency_tracer_metric_entity() const
+{
+ CHECK_NOTNULL(_latency_tracer_metric_entity,
+ "latency_tracer metric entity (description={},
starting_point={}, end_point={}) "
+ "should has been instantiated: uninitialized entity cannot
be used to "
+ "instantiate metric",
+ _description,
+ _starting_point,
+ _end_point);
+ return _latency_tracer_metric_entity;
+}
+
+// Manage the lifetime of all latency-tracer-level metric entities.
+class latency_tracer_metric_entities
+{
+public:
+ using entity_map = std::unordered_map<std::string,
std::unique_ptr<latency_tracer_metrics>>;
+
+ latency_tracer_metric_entities() = default;
+ ~latency_tracer_metric_entities() = default;
+
+// Acquire read lock firstly, since once the metric entity were created, there
would be no need to
+// acquire write lock again.
+#define __METRIC_DEFINE_SET(name, value_type)
\
+ void METRIC_FUNC_NAME_SET(name)(const std::string &description,
\
+ const std::string &starting_point,
\
+ const std::string &end_point,
\
+ value_type value)
\
+ {
\
+ auto entity_id = LATENCY_TRACER_METRIC_ENTITY_ID(description,
starting_point, end_point); \
+ {
\
+ dsn::utils::auto_read_lock l(_lock);
\
+ auto iter = _entities.find(entity_id);
\
+ if (dsn_likely(iter != _entities.end())) {
\
+ METRIC_SET(*(iter->second), name, value);
\
+ return;
\
+ }
\
+ }
\
+
\
+ dsn::utils::auto_write_lock l(_lock);
\
+ auto iter = _entities.find(entity_id);
\
+ if (dsn_unlikely(iter != _entities.end())) {
\
+ METRIC_SET(*(iter->second), name, value);
\
+ return;
\
+ }
\
+
\
+ auto ret = _entities.emplace(
\
+ entity_id,
\
+ std::make_unique<latency_tracer_metrics>(description,
starting_point, end_point)); \
+ CHECK_TRUE(ret.second);
\
+ METRIC_SET(*(ret.first->second), name, value);
\
}
- auto perf_counter =
-
dsn::perf_counters::instance().get_app_counter(FLAGS_latency_tracer_counter_name_prefix,
- name.c_str(),
-
COUNTER_TYPE_NUMBER_PERCENTILES,
- name.c_str(),
- true);
+ __METRIC_DEFINE_SET(latency_tracer_duration_ns, int64_t)
- counters_trace_latency.emplace(name, perf_counter);
- return perf_counter;
-}
+#undef __METRIC_DEFINE_SET
+
+private:
+ mutable utils::rw_lock_nr _lock;
+ entity_map _entities;
+
+ DISALLOW_COPY_AND_ASSIGN(latency_tracer_metric_entities);
+};
+
+latency_tracer_metric_entities s_latency_tracer_metric_entities;
+
+dsn::utils::rw_lock_nr task_code_lock; //{
+std::unordered_map<std::string, bool> task_codes;
+// }
bool is_enable_trace(const dsn::task_code &code)
{
@@ -118,6 +209,8 @@ bool is_enable_trace(const dsn::task_code &code)
return enable_trace;
}
+} // anonymous namespace
+
latency_tracer::latency_tracer(bool is_sub,
std::string name,
uint64_t threshold,
@@ -242,9 +335,12 @@ void latency_tracer::dump_trace_points(/*out*/ std::string
&traces)
auto total_latency = point.first - start_time;
if (FLAGS_enable_latency_tracer_report) {
- std::string counter_name =
- fmt::format("[{}]{}@{}", _description,
previous_point_name, cur_point_name);
- report_trace_point(counter_name, span_duration);
+ METRIC_SET(s_latency_tracer_metric_entities,
+ latency_tracer_duration_ns,
+ _description,
+ previous_point_name,
+ cur_point_name,
+ span_duration);
}
if (total_time_used >= _threshold) {
@@ -281,13 +377,5 @@ void latency_tracer::dump_trace_points(/*out*/ std::string
&traces)
}
}
-void latency_tracer::report_trace_point(const std::string &name, uint64_t span)
-{
- auto perf_counter = get_trace_counter(name);
- if (perf_counter) {
- perf_counter->set(span);
- }
-}
-
} // namespace utils
} // namespace dsn
diff --git a/src/utils/latency_tracer.h b/src/utils/latency_tracer.h
index 226536e01..459786645 100644
--- a/src/utils/latency_tracer.h
+++ b/src/utils/latency_tracer.h
@@ -165,9 +165,6 @@ public:
bool enabled() const { return _enable_trace; }
private:
- // report the trace point duration to monitor system
- static void report_trace_point(const std::string &name, uint64_t span);
-
// dump and print the trace point into log file
void dump_trace_points(/*out*/ std::string &traces);
diff --git a/src/utils/metrics.h b/src/utils/metrics.h
index e484e4c1d..3627b078f 100644
--- a/src/utils/metrics.h
+++ b/src/utils/metrics.h
@@ -193,6 +193,8 @@ class error_code;
#define METRIC_VAR_INIT_backup_policy(name, ...) METRIC_VAR_INIT(name,
backup_policy, ##__VA_ARGS__)
#define METRIC_VAR_INIT_queue(name, ...) METRIC_VAR_INIT(name, queue,
##__VA_ARGS__)
#define METRIC_VAR_ASSIGN_profiler(name, ...) METRIC_VAR_ASSIGN(name,
profiler, ##__VA_ARGS__)
+#define METRIC_VAR_INIT_latency_tracer(name, ...)
\
+ METRIC_VAR_INIT(name, latency_tracer, ##__VA_ARGS__)
// Perform increment_by() operations on gauges and counters.
#define METRIC_VAR_INCREMENT_BY(name, x)
\
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]