This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git
The following commit(s) were added to refs/heads/main by this push:
new 8f58593a1 Improve couch_stats loading
8f58593a1 is described below
commit 8f58593a15ba3ca104a9fe22d8b1df8f9ed8632a
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Thu Feb 19 00:46:53 2026 -0500
Improve couch_stats loading
There are two main improvements:
1) Remove dynamic runtime reloading. That feature is not useful any longer
after we stopped doing online relups, and instead became a potential
landmine
that can lead to metrics unexpectedly faling if stats description files, or
disk access to them changes at runtime. The only minor benefit previous
behavior had was was for local dev/run development, coupled with online
module
reloading; but that hardly justifies the complexity and the confusion
resulting
from stats all of the sudden breaking in production.
2) Strengthen a few asserts during stats loading. Only accept missing stats
description from applications, and stop on any other error. We don't want to
run with stats missing or unloadable or constantly spewing "unknown metric"
error.
---
src/couch_stats/src/couch_stats.erl | 13 +--
src/couch_stats/src/couch_stats_server.erl | 134 ++++++-----------------------
src/couch_stats/src/couch_stats_util.erl | 115 +++++++++++++++----------
3 files changed, 94 insertions(+), 168 deletions(-)
diff --git a/src/couch_stats/src/couch_stats.erl
b/src/couch_stats/src/couch_stats.erl
index 29a402449..c761614c8 100644
--- a/src/couch_stats/src/couch_stats.erl
+++ b/src/couch_stats/src/couch_stats.erl
@@ -14,7 +14,6 @@
-export([
fetch/0,
- reload/0,
sample/1,
increment_counter/1,
increment_counter/2,
@@ -33,9 +32,6 @@ fetch() ->
% Last -1 is because the interval ends are inclusive
couch_stats_util:fetch(stats(), StartSec, Seconds).
-reload() ->
- couch_stats_server:reload().
-
-spec sample(any()) -> stat().
sample(Name) ->
Seconds = couch_stats_util:histogram_interval_sec(),
@@ -112,7 +108,6 @@ couch_stats_test_() ->
[
?TDEF_FE(t_fetch_metrics),
?TDEF_FE(t_sample_metrics),
- ?TDEF_FE(t_reload),
?TDEF_FE(t_increment_counter),
?TDEF_FE(t_decrement_counter),
?TDEF_FE(t_update_gauge),
@@ -144,10 +139,6 @@ t_sample_metrics(_) ->
?assertEqual(0, sample([couch_replicator, jobs, total])).
-t_reload(_) ->
- % This is tested in detail in couch_stats_server.
- ?assertEqual(ok, reload()).
-
t_increment_counter(_) ->
[increment_counter([fsync, count]) || _ <- lists:seq(1, 1000)],
?assert(sample([fsync, count]) > 1000).
@@ -190,8 +181,6 @@ t_access_invalid_metrics(_) ->
?assertEqual({error, invalid_metric}, decrement_counter([fsync, time],
100)),
?assertEqual({error, invalid_metric}, update_gauge([fsync, count], 100)),
?assertEqual({error, invalid_metric}, update_histogram([fsync, count],
100)),
- ?assertThrow({invalid_metric, _}, update_histogram([fsync, count], Fun)),
- InvalidMetrics = #{[bad] => {invalid, <<"desc">>}},
- ?assertThrow({unknown_metric, _},
couch_stats_util:create_metrics(InvalidMetrics)).
+ ?assertThrow({invalid_metric, _}, update_histogram([fsync, count], Fun)).
-endif.
diff --git a/src/couch_stats/src/couch_stats_server.erl
b/src/couch_stats/src/couch_stats_server.erl
index ebbc9f162..640c1fb23 100644
--- a/src/couch_stats/src/couch_stats_server.erl
+++ b/src/couch_stats/src/couch_stats_server.erl
@@ -10,21 +10,14 @@
% License for the specific language governing permissions and limitations under
% the License.
-% couch_stats_server is in charge of:
-% - Initial metric loading from application stats descriptions.
-% - Recycling(resetting to 0) stale histogram counters.
-% - Checking and reloading if stats descriptions change.
-% - Checking and reloading if histogram interval config value changes.
+% couch_stats_server is in charge of initially loading stats definition into a
+% persistent term then recycling(resetting to 0) stale histogram counters.
%
-module(couch_stats_server).
-behaviour(gen_server).
--export([
- reload/0
-]).
-
-export([
start_link/0,
init/1,
@@ -33,45 +26,53 @@
handle_info/2
]).
--define(RELOAD_INTERVAL_SEC, 600).
+% config_listener
+-export([
+ handle_config_change/5,
+ handle_config_terminate/3
+]).
-record(st, {
- hist_interval,
histograms,
- clean_tref,
- reload_tref
+ clean_tref
}).
-reload() ->
- gen_server:call(?MODULE, reload).
-
start_link() ->
gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
init([]) ->
+ ok = config:listen_for_changes(?MODULE, self()),
St = #st{
- hist_interval = config:get("stats", "interval"),
clean_tref = erlang:send_after(clean_msec(), self(), clean),
- reload_tref = erlang:send_after(reload_msec(), self(), reload)
+ histograms = couch_stats_util:histograms(couch_stats_util:load())
},
- {_, Stats} = try_reload(St),
- {ok, St#st{histograms = couch_stats_util:histograms(Stats)}}.
+ {ok, St}.
-handle_call(reload, _From, #st{} = St) ->
- {reply, ok, do_reload(St)};
handle_call(Msg, _From, #st{} = St) ->
{stop, {unknown_call, Msg}, unknown_call, St}.
handle_cast(Msg, #st{} = St) ->
{stop, {unknown_cast, Msg}, St}.
-handle_info(reload, #st{} = St) ->
- {noreply, do_reload(St)};
handle_info(clean, #st{} = St) ->
{noreply, do_clean(St)};
+handle_info(restart_config_listener, #st{} = St) ->
+ ok = config:listen_for_changes(?MODULE, self()),
+ {noreply, St};
handle_info(Msg, #st{} = St) ->
{stop, {unknown_info, Msg}, St}.
+handle_config_change("stats", "interval", _, _, Pid) ->
+ Pid ! clean,
+ {ok, Pid};
+handle_config_change(_, _, _, _, Pid) ->
+ {ok, Pid}.
+
+handle_config_terminate(_, stop, _) ->
+ ok;
+handle_config_terminate(_Server, _Reason, Pid) ->
+ erlang:send_after(1000, Pid, restart_config_listener).
+
do_clean(#st{} = St) ->
timer:cancel(St#st.clean_tref),
HistTRef = erlang:send_after(clean_msec(), self(), clean),
@@ -97,50 +98,6 @@ do_clean(#st{} = St) ->
),
St#st{clean_tref = HistTRef}.
-do_reload(#st{} = St) ->
- timer:cancel(St#st.reload_tref),
- RTRef = erlang:send_after(reload_msec(), self(), reload),
- case try_reload(St) of
- {true, NewStats} ->
- timer:cancel(St#st.clean_tref),
- Histograms = couch_stats_util:histograms(NewStats),
- HTRef = erlang:send_after(clean_msec(), self(), clean),
- St#st{
- histograms = Histograms,
- clean_tref = HTRef,
- reload_tref = RTRef,
- hist_interval = config:get("stats", "interval")
- };
- {false, _} ->
- St#st{reload_tref = RTRef}
- end.
-
-try_reload(#st{} = St) ->
- NewDefs = couch_stats_util:load_metrics_for_applications(),
- Stats = couch_stats_util:stats(),
- MetricsChanged = couch_stats_util:metrics_changed(Stats, NewDefs),
- IntervalChanged = interval_changed(St),
- case MetricsChanged orelse IntervalChanged of
- true ->
- couch_stats_util:reset_histogram_interval_sec(),
- NewStats = couch_stats_util:create_metrics(NewDefs),
- couch_stats_util:replace_stats(NewStats),
- {true, NewStats};
- false ->
- {false, Stats}
- end.
-
-interval_changed(#st{hist_interval = OldInterval}) ->
- case config:get("stats", "interval") of
- Interval when OldInterval =:= Interval ->
- false;
- _ ->
- true
- end.
-
-reload_msec() ->
- 1000 * ?RELOAD_INTERVAL_SEC.
-
clean_msec() ->
% We want to wake up more often than our interval so we decide to wake
% about twice as often. If the interval is 10 seconds, we'd wake up every 5
@@ -152,8 +109,6 @@ clean_msec() ->
-include_lib("couch/include/couch_eunit.hrl").
--define(TIMEOUT, 10).
-
couch_stats_server_test_() ->
{
foreach,
@@ -161,8 +116,6 @@ couch_stats_server_test_() ->
fun teardown/1,
[
?TDEF_FE(t_server_starts),
- ?TDEF_FE(t_reload_with_no_changes_works, 10),
- ?TDEF_FE(t_reload_with_changes_works, 10),
?TDEF_FE(t_cleaning_works, 10),
?TDEF_FE(t_invalid_call),
?TDEF_FE(t_invalid_cast),
@@ -180,50 +133,13 @@ teardown(Ctx) ->
t_server_starts(_) ->
?assert(is_process_alive(whereis(?MODULE))).
-t_reload_with_no_changes_works(_) ->
- Pid = whereis(?MODULE),
- ?assert(is_process_alive(Pid)),
- ?assertEqual(ok, reload()),
- ?assertEqual(Pid, whereis(?MODULE)),
- ?assert(is_process_alive(Pid)),
- % Let's reload a few more hundred times
- lists:foreach(
- fun(_) ->
- ?assertEqual(ok, reload()),
- ?assertEqual(Pid, whereis(?MODULE)),
- ?assert(is_process_alive(Pid))
- end,
- lists:seq(1, 100)
- ).
-
-t_reload_with_changes_works(_) ->
- Pid = whereis(?MODULE),
- ?assert(is_process_alive(Pid)),
- #st{hist_interval = Interval0} = sys:get_state(Pid),
- ?assertEqual(undefined, Interval0),
-
- config:set("stats", "interval", "7", false),
- ?assertEqual(ok, reload()),
- ?assertEqual(Pid, whereis(?MODULE)),
- ?assert(is_process_alive(Pid)),
- #st{hist_interval = Interval1} = sys:get_state(Pid),
- ?assertEqual("7", Interval1),
-
- #st{histograms = Hists} = sys:get_state(Pid),
- [{_Key, {histogram, HCtx1, _Desc}} | _] = maps:to_list(Hists),
- % Histogram window size should now be shorter
- % 7 (active time window) + 7 (stale) + 5 + 5 for buffers = 24.
- ?assertEqual(24, tuple_size(HCtx1)).
-
t_cleaning_works(_) ->
config:set("stats", "interval", "1", false),
sys:log(?MODULE, {true, 100}),
- ok = reload(),
timer:sleep(2000),
{ok, Events} = sys:log(?MODULE, get),
ok = sys:log(?MODULE, false),
config:set("stats", "interval", "10", false),
- ok = reload(),
% Events looks like: [{in, Msg} | {noreply, ...} | {out, ..}, ...]
CleanEvents = [clean || {in, clean} <- Events],
?assert(length(CleanEvents) >= 3).
diff --git a/src/couch_stats/src/couch_stats_util.erl
b/src/couch_stats/src/couch_stats_util.erl
index 5cbd54b1d..a63d38023 100644
--- a/src/couch_stats/src/couch_stats_util.erl
+++ b/src/couch_stats/src/couch_stats_util.erl
@@ -14,9 +14,7 @@
-export([
% Load metrics from apps
- create_metrics/1,
- load_metrics_for_applications/0,
- metrics_changed/2,
+ load/0,
% Get various metric types
get_counter/2,
@@ -26,10 +24,8 @@
% Get histogram interval config settings
histogram_interval_sec/0,
histogram_safety_buffer_size_sec/0,
- reset_histogram_interval_sec/0,
- % Manage the main stats (metrics) persistent term map
- replace_stats/1,
+ % Get the stats persistent term map
stats/0,
histograms/1,
@@ -38,6 +34,8 @@
sample/4
]).
+-include_lib("stdlib/include/assert.hrl").
+
-define(DEFAULT_INTERVAL_SEC, 10).
% Histogram types
@@ -51,41 +49,77 @@
% Persistent term keys
-define(STATS_KEY, {?MODULE, stats}).
--define(HIST_TIME_INTERVAL_KEY, {?MODULE, hist_time_interval}).
+
+% Don't waste time looking for stats definition in some built-in and dependency
+% apps. This doesn't have to be an exhaustive list, it's just to avoid doing
+% extra work.
+%
+-define(SKIP_APPS, [
+ asn1,
+ b64url,
+ compiler,
+ cowlib,
+ crypto,
+ gun,
+ ibrowse,
+ inets,
+ jiffy,
+ kernel,
+ meck,
+ mochiweb,
+ os_mon,
+ public_key,
+ rebar,
+ rebar3,
+ recon,
+ runtime_tools,
+ sasl,
+ snappy,
+ ssl,
+ stdlib,
+ syntax_tools,
+ xmerl
+]).
+
+load() ->
+ Definitions = load_metrics_for_applications(),
+ Stats = create_metrics(Definitions),
+ persistent_term:put(?STATS_KEY, Stats),
+ Stats.
load_metrics_for_applications() ->
Apps = [element(1, A) || A <- application:loaded_applications()],
- lists:foldl(fun load_metrics_for_application_fold/2, #{}, Apps).
+ Apps1 = [A || A <- Apps, not lists:member(A, ?SKIP_APPS)],
+ lists:foldl(fun load_metrics_for_application_fold/2, #{}, Apps1).
load_metrics_for_application_fold(AppName, #{} = Acc) ->
- case code:priv_dir(AppName) of
- {error, _Error} ->
+ % For an existing application we should always be able to compute its
+ % priv_dir path, even though the directory itself may not exist or may not
+ % be accessible.
+ Dir = code:priv_dir(AppName),
+ ?assert(is_list(Dir), "Could not get application priv_dir " ++
atom_to_list(AppName)),
+ Path = filename:join(Dir, "stats_descriptions.cfg"),
+ % Expect some apps not to have stats descriptions and priv_dir paths to
not even exist
+ case file:consult(Path) of
+ {ok, Descriptions} ->
+ DescMap = maps:map(
+ fun(_, TypeDesc) ->
+ Type = proplists:get_value(type, TypeDesc, counter),
+ Desc = proplists:get_value(desc, TypeDesc, <<>>),
+ {Type, Desc}
+ end,
+ maps:from_list(Descriptions)
+ ),
+ maps:merge(Acc, DescMap);
+ {error, enoent} ->
Acc;
- Dir ->
- case file:consult(Dir ++ "/stats_descriptions.cfg") of
- {ok, Descriptions} ->
- DescMap = maps:map(
- fun(_, TypeDesc) ->
- Type = proplists:get_value(type, TypeDesc,
counter),
- Desc = proplists:get_value(desc, TypeDesc, <<>>),
- {Type, Desc}
- end,
- maps:from_list(Descriptions)
- ),
- maps:merge(Acc, DescMap);
- {error, _Error} ->
- Acc
- end
+ {error, enotdir} ->
+ Acc;
+ {error, Error} ->
+ % Bail if we can't load stats for any other reason
+ error({couch_stats_load_error, Path, Error})
end.
-metrics_changed(#{} = Map1, #{} = Map2) when map_size(Map1) =/= map_size(Map2)
->
- % If their sizes are differently they are obvioulsy not the same
- true;
-metrics_changed(#{} = Map1, #{} = Map2) when map_size(Map1) =:= map_size(Map2)
->
- % If their intersection size is not the same as their individual size
- % they are also not the same
- map_size(maps:intersect(Map1, Map2)) =/= map_size(Map1).
-
get_counter(Name, #{} = Stats) ->
get_metric(Name, ?CNTR, Stats).
@@ -108,17 +142,7 @@ get_metric(Name, Type, Stats) when is_atom(Type),
is_map(Stats) ->
end.
histogram_interval_sec() ->
- case persistent_term:get(?HIST_TIME_INTERVAL_KEY, not_cached) of
- not_cached ->
- Time = config:get_integer("stats", "interval",
?DEFAULT_INTERVAL_SEC),
- persistent_term:put(?HIST_TIME_INTERVAL_KEY, Time),
- Time;
- Val when is_integer(Val) ->
- Val
- end.
-
-reset_histogram_interval_sec() ->
- persistent_term:erase(?HIST_TIME_INTERVAL_KEY).
+ config:get_integer("stats", "interval", ?DEFAULT_INTERVAL_SEC).
histogram_safety_buffer_size_sec() ->
?HIST_WRAP_BUFFER_SIZE_SEC.
@@ -128,9 +152,6 @@ histogram_total_size_sec() ->
% periodically clear.
histogram_interval_sec() * 2 + ?HIST_WRAP_BUFFER_SIZE_SEC * 2.
-replace_stats(#{} = Stats) ->
- persistent_term:put(?STATS_KEY, Stats).
-
stats() ->
persistent_term:get(?STATS_KEY, #{}).