This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch remove-status-runtime-reloading in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 24d7e9d4b884e937d1b3e494ca697f274015b6d6 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Thu Feb 19 00:46:53 2026 -0500 Improve couch_stats loading There are two main improvements: 1) Remove dynamic runtime reloading. That feature is not useful any longer after we stopped doing online relups, and instead became a potential landmine that can lead to metrics unexpectedly faling if stats description files, or disk access to them changes at runtime. The only minor benefit previous behavior had was was for local dev/run development, coupled with online module reloading; but that hardly justifies the complexity and the confusion resulting from stats all of the sudden breaking in production. 2) Strengthen a few asserts during stats loading. Only accept missing stats description from applications, and stop on any other error. We don't want to run with stats missing or unloadable or constantly spewing "unknown metric" error. --- src/couch_stats/src/couch_stats.erl | 13 +-- src/couch_stats/src/couch_stats_server.erl | 134 ++++++----------------------- src/couch_stats/src/couch_stats_util.erl | 79 +++++++---------- 3 files changed, 59 insertions(+), 167 deletions(-) diff --git a/src/couch_stats/src/couch_stats.erl b/src/couch_stats/src/couch_stats.erl index 29a402449..c761614c8 100644 --- a/src/couch_stats/src/couch_stats.erl +++ b/src/couch_stats/src/couch_stats.erl @@ -14,7 +14,6 @@ -export([ fetch/0, - reload/0, sample/1, increment_counter/1, increment_counter/2, @@ -33,9 +32,6 @@ fetch() -> % Last -1 is because the interval ends are inclusive couch_stats_util:fetch(stats(), StartSec, Seconds). -reload() -> - couch_stats_server:reload(). - -spec sample(any()) -> stat(). sample(Name) -> Seconds = couch_stats_util:histogram_interval_sec(), @@ -112,7 +108,6 @@ couch_stats_test_() -> [ ?TDEF_FE(t_fetch_metrics), ?TDEF_FE(t_sample_metrics), - ?TDEF_FE(t_reload), ?TDEF_FE(t_increment_counter), ?TDEF_FE(t_decrement_counter), ?TDEF_FE(t_update_gauge), @@ -144,10 +139,6 @@ t_sample_metrics(_) -> ?assertEqual(0, sample([couch_replicator, jobs, total])). -t_reload(_) -> - % This is tested in detail in couch_stats_server. - ?assertEqual(ok, reload()). - t_increment_counter(_) -> [increment_counter([fsync, count]) || _ <- lists:seq(1, 1000)], ?assert(sample([fsync, count]) > 1000). @@ -190,8 +181,6 @@ t_access_invalid_metrics(_) -> ?assertEqual({error, invalid_metric}, decrement_counter([fsync, time], 100)), ?assertEqual({error, invalid_metric}, update_gauge([fsync, count], 100)), ?assertEqual({error, invalid_metric}, update_histogram([fsync, count], 100)), - ?assertThrow({invalid_metric, _}, update_histogram([fsync, count], Fun)), - InvalidMetrics = #{[bad] => {invalid, <<"desc">>}}, - ?assertThrow({unknown_metric, _}, couch_stats_util:create_metrics(InvalidMetrics)). + ?assertThrow({invalid_metric, _}, update_histogram([fsync, count], Fun)). -endif. diff --git a/src/couch_stats/src/couch_stats_server.erl b/src/couch_stats/src/couch_stats_server.erl index ebbc9f162..640c1fb23 100644 --- a/src/couch_stats/src/couch_stats_server.erl +++ b/src/couch_stats/src/couch_stats_server.erl @@ -10,21 +10,14 @@ % License for the specific language governing permissions and limitations under % the License. -% couch_stats_server is in charge of: -% - Initial metric loading from application stats descriptions. -% - Recycling(resetting to 0) stale histogram counters. -% - Checking and reloading if stats descriptions change. -% - Checking and reloading if histogram interval config value changes. +% couch_stats_server is in charge of initially loading stats definition into a +% persistent term then recycling(resetting to 0) stale histogram counters. % -module(couch_stats_server). -behaviour(gen_server). --export([ - reload/0 -]). - -export([ start_link/0, init/1, @@ -33,45 +26,53 @@ handle_info/2 ]). --define(RELOAD_INTERVAL_SEC, 600). +% config_listener +-export([ + handle_config_change/5, + handle_config_terminate/3 +]). -record(st, { - hist_interval, histograms, - clean_tref, - reload_tref + clean_tref }). -reload() -> - gen_server:call(?MODULE, reload). - start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). init([]) -> + ok = config:listen_for_changes(?MODULE, self()), St = #st{ - hist_interval = config:get("stats", "interval"), clean_tref = erlang:send_after(clean_msec(), self(), clean), - reload_tref = erlang:send_after(reload_msec(), self(), reload) + histograms = couch_stats_util:histograms(couch_stats_util:load()) }, - {_, Stats} = try_reload(St), - {ok, St#st{histograms = couch_stats_util:histograms(Stats)}}. + {ok, St}. -handle_call(reload, _From, #st{} = St) -> - {reply, ok, do_reload(St)}; handle_call(Msg, _From, #st{} = St) -> {stop, {unknown_call, Msg}, unknown_call, St}. handle_cast(Msg, #st{} = St) -> {stop, {unknown_cast, Msg}, St}. -handle_info(reload, #st{} = St) -> - {noreply, do_reload(St)}; handle_info(clean, #st{} = St) -> {noreply, do_clean(St)}; +handle_info(restart_config_listener, #st{} = St) -> + ok = config:listen_for_changes(?MODULE, self()), + {noreply, St}; handle_info(Msg, #st{} = St) -> {stop, {unknown_info, Msg}, St}. +handle_config_change("stats", "interval", _, _, Pid) -> + Pid ! clean, + {ok, Pid}; +handle_config_change(_, _, _, _, Pid) -> + {ok, Pid}. + +handle_config_terminate(_, stop, _) -> + ok; +handle_config_terminate(_Server, _Reason, Pid) -> + erlang:send_after(1000, Pid, restart_config_listener). + do_clean(#st{} = St) -> timer:cancel(St#st.clean_tref), HistTRef = erlang:send_after(clean_msec(), self(), clean), @@ -97,50 +98,6 @@ do_clean(#st{} = St) -> ), St#st{clean_tref = HistTRef}. -do_reload(#st{} = St) -> - timer:cancel(St#st.reload_tref), - RTRef = erlang:send_after(reload_msec(), self(), reload), - case try_reload(St) of - {true, NewStats} -> - timer:cancel(St#st.clean_tref), - Histograms = couch_stats_util:histograms(NewStats), - HTRef = erlang:send_after(clean_msec(), self(), clean), - St#st{ - histograms = Histograms, - clean_tref = HTRef, - reload_tref = RTRef, - hist_interval = config:get("stats", "interval") - }; - {false, _} -> - St#st{reload_tref = RTRef} - end. - -try_reload(#st{} = St) -> - NewDefs = couch_stats_util:load_metrics_for_applications(), - Stats = couch_stats_util:stats(), - MetricsChanged = couch_stats_util:metrics_changed(Stats, NewDefs), - IntervalChanged = interval_changed(St), - case MetricsChanged orelse IntervalChanged of - true -> - couch_stats_util:reset_histogram_interval_sec(), - NewStats = couch_stats_util:create_metrics(NewDefs), - couch_stats_util:replace_stats(NewStats), - {true, NewStats}; - false -> - {false, Stats} - end. - -interval_changed(#st{hist_interval = OldInterval}) -> - case config:get("stats", "interval") of - Interval when OldInterval =:= Interval -> - false; - _ -> - true - end. - -reload_msec() -> - 1000 * ?RELOAD_INTERVAL_SEC. - clean_msec() -> % We want to wake up more often than our interval so we decide to wake % about twice as often. If the interval is 10 seconds, we'd wake up every 5 @@ -152,8 +109,6 @@ clean_msec() -> -include_lib("couch/include/couch_eunit.hrl"). --define(TIMEOUT, 10). - couch_stats_server_test_() -> { foreach, @@ -161,8 +116,6 @@ couch_stats_server_test_() -> fun teardown/1, [ ?TDEF_FE(t_server_starts), - ?TDEF_FE(t_reload_with_no_changes_works, 10), - ?TDEF_FE(t_reload_with_changes_works, 10), ?TDEF_FE(t_cleaning_works, 10), ?TDEF_FE(t_invalid_call), ?TDEF_FE(t_invalid_cast), @@ -180,50 +133,13 @@ teardown(Ctx) -> t_server_starts(_) -> ?assert(is_process_alive(whereis(?MODULE))). -t_reload_with_no_changes_works(_) -> - Pid = whereis(?MODULE), - ?assert(is_process_alive(Pid)), - ?assertEqual(ok, reload()), - ?assertEqual(Pid, whereis(?MODULE)), - ?assert(is_process_alive(Pid)), - % Let's reload a few more hundred times - lists:foreach( - fun(_) -> - ?assertEqual(ok, reload()), - ?assertEqual(Pid, whereis(?MODULE)), - ?assert(is_process_alive(Pid)) - end, - lists:seq(1, 100) - ). - -t_reload_with_changes_works(_) -> - Pid = whereis(?MODULE), - ?assert(is_process_alive(Pid)), - #st{hist_interval = Interval0} = sys:get_state(Pid), - ?assertEqual(undefined, Interval0), - - config:set("stats", "interval", "7", false), - ?assertEqual(ok, reload()), - ?assertEqual(Pid, whereis(?MODULE)), - ?assert(is_process_alive(Pid)), - #st{hist_interval = Interval1} = sys:get_state(Pid), - ?assertEqual("7", Interval1), - - #st{histograms = Hists} = sys:get_state(Pid), - [{_Key, {histogram, HCtx1, _Desc}} | _] = maps:to_list(Hists), - % Histogram window size should now be shorter - % 7 (active time window) + 7 (stale) + 5 + 5 for buffers = 24. - ?assertEqual(24, tuple_size(HCtx1)). - t_cleaning_works(_) -> config:set("stats", "interval", "1", false), sys:log(?MODULE, {true, 100}), - ok = reload(), timer:sleep(2000), {ok, Events} = sys:log(?MODULE, get), ok = sys:log(?MODULE, false), config:set("stats", "interval", "10", false), - ok = reload(), % Events looks like: [{in, Msg} | {noreply, ...} | {out, ..}, ...] CleanEvents = [clean || {in, clean} <- Events], ?assert(length(CleanEvents) >= 3). diff --git a/src/couch_stats/src/couch_stats_util.erl b/src/couch_stats/src/couch_stats_util.erl index 5cbd54b1d..4b032c32f 100644 --- a/src/couch_stats/src/couch_stats_util.erl +++ b/src/couch_stats/src/couch_stats_util.erl @@ -14,9 +14,7 @@ -export([ % Load metrics from apps - create_metrics/1, - load_metrics_for_applications/0, - metrics_changed/2, + load/0, % Get various metric types get_counter/2, @@ -26,10 +24,8 @@ % Get histogram interval config settings histogram_interval_sec/0, histogram_safety_buffer_size_sec/0, - reset_histogram_interval_sec/0, - % Manage the main stats (metrics) persistent term map - replace_stats/1, + % Get the stats persistent term map stats/0, histograms/1, @@ -38,6 +34,8 @@ sample/4 ]). +-include_lib("stdlib/include/assert.hrl"). + -define(DEFAULT_INTERVAL_SEC, 10). % Histogram types @@ -51,41 +49,43 @@ % Persistent term keys -define(STATS_KEY, {?MODULE, stats}). --define(HIST_TIME_INTERVAL_KEY, {?MODULE, hist_time_interval}). + +load() -> + Definitions = load_metrics_for_applications(), + Stats = create_metrics(Definitions), + persistent_term:put(?STATS_KEY, Stats), + Stats. load_metrics_for_applications() -> Apps = [element(1, A) || A <- application:loaded_applications()], lists:foldl(fun load_metrics_for_application_fold/2, #{}, Apps). load_metrics_for_application_fold(AppName, #{} = Acc) -> - case code:priv_dir(AppName) of - {error, _Error} -> + % For an existing application we should always be able to compute its + % priv_dir path, even though the directory itself may not exist or may not + % be accessible. + Dir = code:priv_dir(AppName), + ?assert(is_list(Dir), "Could not get application priv_dir " ++ atom_to_list(AppName)), + Path = filename:join(Dir, "stats_descriptions.cfg"), + case file:consult(Path) of + {ok, Descriptions} -> + DescMap = maps:map( + fun(_, TypeDesc) -> + Type = proplists:get_value(type, TypeDesc, counter), + Desc = proplists:get_value(desc, TypeDesc, <<>>), + {Type, Desc} + end, + maps:from_list(Descriptions) + ), + maps:merge(Acc, DescMap); + {error, enoent} -> + % That's fine, not all applications have stats descriptions Acc; - Dir -> - case file:consult(Dir ++ "/stats_descriptions.cfg") of - {ok, Descriptions} -> - DescMap = maps:map( - fun(_, TypeDesc) -> - Type = proplists:get_value(type, TypeDesc, counter), - Desc = proplists:get_value(desc, TypeDesc, <<>>), - {Type, Desc} - end, - maps:from_list(Descriptions) - ), - maps:merge(Acc, DescMap); - {error, _Error} -> - Acc - end + {error, Error} -> + % Bail if we can't load stats for any other reason + error({couch_stats_load_error, Path, Error}) end. -metrics_changed(#{} = Map1, #{} = Map2) when map_size(Map1) =/= map_size(Map2) -> - % If their sizes are differently they are obvioulsy not the same - true; -metrics_changed(#{} = Map1, #{} = Map2) when map_size(Map1) =:= map_size(Map2) -> - % If their intersection size is not the same as their individual size - % they are also not the same - map_size(maps:intersect(Map1, Map2)) =/= map_size(Map1). - get_counter(Name, #{} = Stats) -> get_metric(Name, ?CNTR, Stats). @@ -108,17 +108,7 @@ get_metric(Name, Type, Stats) when is_atom(Type), is_map(Stats) -> end. histogram_interval_sec() -> - case persistent_term:get(?HIST_TIME_INTERVAL_KEY, not_cached) of - not_cached -> - Time = config:get_integer("stats", "interval", ?DEFAULT_INTERVAL_SEC), - persistent_term:put(?HIST_TIME_INTERVAL_KEY, Time), - Time; - Val when is_integer(Val) -> - Val - end. - -reset_histogram_interval_sec() -> - persistent_term:erase(?HIST_TIME_INTERVAL_KEY). + config:get_integer("stats", "interval", ?DEFAULT_INTERVAL_SEC). histogram_safety_buffer_size_sec() -> ?HIST_WRAP_BUFFER_SIZE_SEC. @@ -128,9 +118,6 @@ histogram_total_size_sec() -> % periodically clear. histogram_interval_sec() * 2 + ?HIST_WRAP_BUFFER_SIZE_SEC * 2. -replace_stats(#{} = Stats) -> - persistent_term:put(?STATS_KEY, Stats). - stats() -> persistent_term:get(?STATS_KEY, #{}).
