This is an automated email from the ASF dual-hosted git repository. willholley pushed a commit to branch prometheus_erlang_dist in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 7d4f0a6702e2540c8d2a4a3e2d88331d3af6edae Author: Will Holley <[email protected]> AuthorDate: Wed Apr 12 15:08:45 2023 +0000 feat (prometheus): couch_db_updater and couch_file queue stats # What Adds summary metrics for couch_db_updater and couch_file, the same as returned by the `_system` endpoint. Unlike the other message queue stats, these are returned as a Prometheus summary type across the following metrics, using `couch_db_updater` as an example: * couchdb_erlang_message_queue_couch_db_updater{quantile="0.5"} * couchdb_erlang_message_queue_couch_db_updater{quantile="0.9"} * couchdb_erlang_message_queue_couch_db_updater{quantile="0.99"} * couchdb_erlang_message_queue_couch_db_updater_sum * couchdb_erlang_message_queue_couch_db_updater_count The count metric represents the number of processes and the sum is the total size of all message queues for those processes. In addition, min and max message queue sizes are returned, matching the `_system` endpoint response: * couchdb_erlang_message_queue_couch_db_updater_min * couchdb_erlang_message_queue_couch_db_updater_max # How This represents a new type of metric in the Prometheus endpoint - the existing `summary` types have all been for latency histograms - so a new utility function `pid_to_prom_summary` is added to format the message queue stats into Prometheus metric series. In `chttpd_node` I've extracted the formatting step from the `db_pid_stats` function to allow for re-use between `chttpd_node` and `couch_prometheus_server`, where the result is formatted differently. `chttpd_node` doesn't seem like the best place to put shared code like this, but neither does there seem to be an obvious place to extract it to as an alternative, so I've left it for now. 
--- src/chttpd/src/chttpd_node.erl | 11 +++++--- .../src/couch_prometheus_server.erl | 31 ++++++++++++++++++++++ src/couch_prometheus/src/couch_prometheus_util.erl | 1 + 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl index bb3cf4798..ef586e174 100644 --- a/src/chttpd/src/chttpd_node.erl +++ b/src/chttpd/src/chttpd_node.erl @@ -287,7 +287,7 @@ get_stats() -> {NumberOfGCs, WordsReclaimed, _} = statistics(garbage_collection), {{input, Input}, {output, Output}} = statistics(io), - {CF, CDU} = db_pid_stats(), + {CF, CDU} = db_pid_stats_formatted(), MessageQueuesHist = [ {couch_file, {CF}}, {couch_db_updater, {CDU}} @@ -315,6 +315,10 @@ get_stats() -> {distribution, {get_distribution_stats()}} ]. +db_pid_stats_formatted() -> + {CF, CDU} = db_pid_stats(), + {format_pid_stats(CF), format_pid_stats(CDU)}. + db_pid_stats() -> {monitors, M} = process_info(whereis(couch_stats_process_tracker), monitors), Candidates = [Pid || {process, Pid} <- M], @@ -323,7 +327,7 @@ db_pid_stats() -> {CouchFiles, CouchDbUpdaters}. db_pid_stats(Mod, Candidates) -> - Mailboxes = lists:foldl( + lists:foldl( fun(Pid, Acc) -> case process_info(Pid, [message_queue_len, dictionary]) of undefined -> @@ -343,8 +347,7 @@ db_pid_stats(Mod, Candidates) -> end, [], Candidates - ), - format_pid_stats(Mailboxes). + ). format_pid_stats([]) -> []; diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl index 847ad947d..884d792f0 100644 --- a/src/couch_prometheus/src/couch_prometheus_server.erl +++ b/src/couch_prometheus/src/couch_prometheus_server.erl @@ -17,6 +17,7 @@ -import(couch_prometheus_util, [ couch_to_prom/3, to_prom/4, + to_prom/2, to_prom_summary/2 ]). 
@@ -110,6 +111,7 @@ get_system_stats() -> get_uptime_stat(), get_io_stats(), get_message_queue_stats(), + get_db_pid_stats(), get_run_queue_stats(), get_vm_stats(), get_ets_stats(), @@ -220,6 +222,35 @@ get_message_queue_stats() -> to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel) ]. +get_db_pid_stats() -> + {CF, CDU} = chttpd_node:db_pid_stats(), + [ + pid_to_prom_summary( + "erlang_message_queue_couch_file", + "size of message queue across couch_file processes", + CF + ), + pid_to_prom_summary( + "erlang_message_queue_couch_db_updater", + "size of message queue across couch_db_updater processes", + CDU + ) + ]. + +pid_to_prom_summary(Metric, Desc, Mailboxes) -> + Sorted = lists:sort(Mailboxes), + Count = length(Sorted), + Quantiles = [ + {[{quantile, <<"0.5">>}], lists:nth(round(Count * 0.5), Sorted)}, + {[{quantile, <<"0.9">>}], lists:nth(round(Count * 0.9), Sorted)}, + {[{quantile, <<"0.99">>}], lists:nth(round(Count * 0.99), Sorted)} + ], + SumStat = to_prom(Metric ++ ["_sum"], lists:sum(Sorted)), + CountStat = to_prom(Metric ++ ["_count"], length(Sorted)), + MinStat = to_prom(Metric ++ ["_min"], hd(Sorted)), + MaxStat = to_prom(Metric ++ ["_max"], lists:nth(Count, Sorted)), + to_prom(Metric, summary, Desc, Quantiles) ++ [SumStat, CountStat, MinStat, MaxStat]. + get_run_queue_stats() -> %% Workaround for https://bugs.erlang.org/browse/ERL-1355 {SQ, DCQ} = chttpd_node:run_queues(), diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl index 5775b9693..4665ba7f9 100644 --- a/src/couch_prometheus/src/couch_prometheus_util.erl +++ b/src/couch_prometheus/src/couch_prometheus_util.erl @@ -16,6 +16,7 @@ couch_to_prom/3, to_bin/1, to_prom/4, + to_prom/2, to_prom_summary/2 ]).
