This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 322fcf0c0269934bf78bab5f3b0e5961030ed462 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Tue Feb 17 18:08:03 2026 -0500 Switch to hibernate_after While debugging the bug where processes get stuck in hibernation [1], a few benchmarks showed that hibernation can be pretty expensive. I saw a 20% or so reduction in latency in couch_work_queue if we do not hibernate after every single item insertion. The Erlang documentation warns about this [2]: > Use this feature with care, as hibernation implies at least two garbage collections (when hibernating and shortly after waking up) and is not something you want to do between each call to a busy server. In a few places like the `couch_work_queue` and `couch_db_updater` we did exactly that. However, since we added that, more recent Erlang/OTP releases implemented a new `gen_server` option - `{hibernate_after, Timeout}`. It will trigger hibernation after an idle time. That seems ideal for us - it keeps expensive hibernation out of the main data path, as the docs advise, but once the server goes idle we still get to run it to dereference any ref binaries. Since we encountered the recent hibernation bug [1], also add an option to disable it altogether, just to have a way to mitigate the issue when running on OTP 27 and 28 before the fix is out. 
[1] https://github.com/erlang/otp/issues/10651 [2] https://www.erlang.org/doc/apps/stdlib/gen_server.html [3] https://github.com/apache/couchdb/commit/d9eb87f60ac8328afcca25cf6c7aae53b395b089 --- rel/overlay/etc/default.ini | 17 +++++++ src/couch/src/couch_db_updater.erl | 7 +-- src/couch/src/couch_stream.erl | 23 +++++---- src/couch/src/couch_util.erl | 13 +++++ src/couch/src/couch_work_queue.erl | 18 +++---- src/couch/test/eunit/couch_util_tests.erl | 80 +++++++++++++++++++++++++++++++ src/rexi/src/rexi_buffer.erl | 10 ++-- 7 files changed, 138 insertions(+), 30 deletions(-) diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini index d599656b1..6fbc99f58 100644 --- a/rel/overlay/etc/default.ini +++ b/rel/overlay/etc/default.ini @@ -1270,3 +1270,20 @@ url = {{nouveau_url}} ; periodically reload configuration from file. ; Set to infinity to disable. ;auto_reload_secs = infinity + +[hibernate_after] +; Some processes which handle a large number of referenced binaries can benefit +; from hibernating periodically, so they can run a complete garbage collection +; and dereference those binaries. This section configures idle hibernation +; timeouts for some of those processes. The value is milliseconds or +; "infinity". Setting the value to "infinity" disables hibernation. The option +; may be used as a mitigation strategy when running with earlier OTP 27/28 +; versions which had a bug [1] which prevented processes from waking up from +; hibernation. +; +; [1] https://github.com/erlang/otp/issues/10651 + +;rexi_buffer = 5000 +;couch_stream = 5000 +;couch_db_updater = 5000 +;couch_work_queue = 5000 diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 8394cd5a3..0d89c4750 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -49,7 +49,8 @@ init({Engine, DbName, FilePath, Options0}) -> % couch_db:validate_doc_update, which loads them lazily. 
NewDb = Db#db{main_pid = self()}, proc_lib:init_ack({ok, NewDb}), - gen_server:enter_loop(?MODULE, [], NewDb) + GenOpts = couch_util:hibernate_after(?MODULE), + gen_server:enter_loop(?MODULE, GenOpts, NewDb) catch throw:InitError -> proc_lib:init_ack(InitError) @@ -220,11 +221,11 @@ handle_info( false -> Db2 end, - {noreply, Db3, hibernate} + {noreply, Db3} catch throw:retry -> [catch (ClientPid ! {retry, self()}) || ClientPid <- Clients], - {noreply, Db, hibernate} + {noreply, Db} end; handle_info({'EXIT', _Pid, normal}, Db) -> {noreply, Db}; diff --git a/src/couch/src/couch_stream.erl b/src/couch/src/couch_stream.erl index 0e4ccdae6..f3f0bf190 100644 --- a/src/couch/src/couch_stream.erl +++ b/src/couch/src/couch_stream.erl @@ -58,7 +58,8 @@ open({_StreamEngine, _StreamEngineState} = Engine) -> open(Engine, []). open({_StreamEngine, _StreamEngineState} = Engine, Options) -> - gen_server:start_link(?MODULE, {Engine, self(), erlang:get(io_priority), Options}, []). + GenOpts = couch_util:hibernate_after(?MODULE), + gen_server:start_link(?MODULE, {Engine, self(), erlang:get(io_priority), Options}, GenOpts). close(Pid) -> gen_server:call(Pid, close, infinity). 
@@ -223,17 +224,15 @@ handle_call({write, Bin}, _From, Stream) -> Md5_2 = couch_hash:md5_hash_update(Md5, WriteBin2) end, - {reply, ok, - Stream#stream{ - engine = NewEngine, - written_len = WrittenLen2, - buffer_list = [], - buffer_len = 0, - md5 = Md5_2, - identity_md5 = IdenMd5_2, - identity_len = IdenLen + BinSize - }, - hibernate}; + {reply, ok, Stream#stream{ + engine = NewEngine, + written_len = WrittenLen2, + buffer_list = [], + buffer_len = 0, + md5 = Md5_2, + identity_md5 = IdenMd5_2, + identity_len = IdenLen + BinSize + }}; true -> {reply, ok, Stream#stream{ buffer_list = [Bin | Buffer], diff --git a/src/couch/src/couch_util.erl b/src/couch/src/couch_util.erl index d93aaebd6..8ee165666 100644 --- a/src/couch/src/couch_util.erl +++ b/src/couch/src/couch_util.erl @@ -46,6 +46,7 @@ -export([remove_sensitive_data/1]). -export([ejson_to_map/1]). -export([new_set/0, set_from_list/1]). +-export([hibernate_after/1]). -include_lib("couch/include/couch_db.hrl"). @@ -64,6 +65,8 @@ <<"feature_flags">> ]). +-define(DEFAULT_HIBERNATE_AFTER, 5000). + priv_dir() -> case code:priv_dir(couch) of {error, bad_name} -> @@ -806,3 +809,13 @@ new_set() -> set_from_list(KVs) -> sets:from_list(KVs, [{version, 2}]). + +hibernate_after(Module) when is_atom(Module) -> + Key = atom_to_list(Module), + Default = ?DEFAULT_HIBERNATE_AFTER, + case config:get_integer_or_infinity("hibernate_after", Key, Default) of + infinity -> + []; + Timeout when is_integer(Timeout) -> + [{hibernate_after, Timeout}] + end. diff --git a/src/couch/src/couch_work_queue.erl b/src/couch/src/couch_work_queue.erl index 3c6ffeaf8..6ec9d796c 100644 --- a/src/couch/src/couch_work_queue.erl +++ b/src/couch/src/couch_work_queue.erl @@ -34,7 +34,8 @@ }). new(Options) -> - gen_server:start_link(couch_work_queue, Options, []). + GenOpts = couch_util:hibernate_after(?MODULE), + gen_server:start_link(couch_work_queue, Options, GenOpts). 
queue(Wq, Item) when is_binary(Item) -> gen_server:call(Wq, {queue, Item, byte_size(Item)}, infinity); @@ -73,7 +74,7 @@ init(Options) -> max_size = couch_util:get_value(max_size, Options, nil), max_items = couch_util:get_value(max_items, Options, nil) }, - {ok, Q, hibernate}. + {ok, Q}. terminate(_Reason, #q{worker = undefined}) -> ok; @@ -87,18 +88,13 @@ handle_call({queue, Item, Size}, From, #q{worker = undefined} = Q0) -> items = Q0#q.items + 1, queue = queue:in({Item, Size}, Q0#q.queue) }, - case - (Q#q.size >= Q#q.max_size) orelse - (Q#q.items >= Q#q.max_items) - of - true -> - {noreply, Q#q{blocked = [From | Q#q.blocked]}, hibernate}; - false -> - {reply, ok, Q, hibernate} + case (Q#q.size >= Q#q.max_size) orelse (Q#q.items >= Q#q.max_items) of + true -> {noreply, Q#q{blocked = [From | Q#q.blocked]}}; + false -> {reply, ok, Q} end; handle_call({queue, Item, _}, _From, #q{worker = {W, _Max}} = Q) -> gen_server:reply(W, {ok, [Item]}), - {reply, ok, Q#q{worker = undefined}, hibernate}; + {reply, ok, Q#q{worker = undefined}}; handle_call({dequeue, _Max}, _From, #q{worker = {_, _}}) -> % Something went wrong - the same or a different worker is % trying to dequeue an item. We only allow one worker to wait diff --git a/src/couch/test/eunit/couch_util_tests.erl b/src/couch/test/eunit/couch_util_tests.erl index 5f8f1ce43..ff40a6fdd 100644 --- a/src/couch/test/eunit/couch_util_tests.erl +++ b/src/couch/test/eunit/couch_util_tests.erl @@ -206,3 +206,83 @@ ejson_to_map_test() -> ?assertEqual(#{a => 1, b => 2}, couch_util:ejson_to_map({[{b, 2}, {a, 1}]})), ?assertEqual(#{<<"a">> => [1, #{}]}, couch_util:ejson_to_map({[{<<"a">>, [1, {[]}]}]})), ?assertEqual([#{true => 1}], couch_util:ejson_to_map([{[{true, 1}]}])). + +hibernate_after_test_() -> + { + foreach, + fun setup/0, + fun teardown/1, + [ + ?TDEF_FE(t_hibernate), + ?TDEF_FE(t_do_not_hibernate) + ] + }. + +setup() -> + test_util:start_applications([config]). 
+ +teardown(Ctx) -> + config:delete("hibernate_after", "couch_work_queue", false), + test_util:stop_applications(Ctx). + +t_hibernate(_) -> + ?assertEqual([{hibernate_after, 5000}], hibernate_cfg()), + + % Set non-default value + config:set("hibernate_after", "couch_work_queue", "100", false), + ?assertEqual([{hibernate_after, 100}], hibernate_cfg()), + + {ok, Q} = couch_work_queue:new([]), + % Enqueue item, creating gen_server activity + ok = couch_work_queue:queue(Q, potato), + + % Assert that we eventually hibernate + wait_hibernate(Q), + {current_function, {_, F, _}} = process_info(Q, current_function), + % Note: different versions of OTP hibernate at a different function + ?assert(hibernate == F orelse loop_hibernate == F), + + % We can wake up from hibernation and get back to work + ?assertEqual({ok, [potato]}, couch_work_queue:dequeue(Q)), + + % Then we can get back into hibernation + wait_hibernate(Q), + {current_function, {_, F, _}} = process_info(Q, current_function), + ?assert(hibernate == F orelse loop_hibernate == F), + + couch_work_queue:close(Q). + +t_do_not_hibernate(_) -> + config:set("hibernate_after", "couch_work_queue", "infinity", false), + ?assertEqual([], hibernate_cfg()), + + {ok, Q} = couch_work_queue:new([]), + + % Enqueue item, creating some gen_server activity + ok = couch_work_queue:queue(Q, potato), + + % Assert that we do not hibernate + timer:sleep(200), + {current_function, {_, F, _}} = process_info(Q, current_function), + ?assertNot(hibernate == F orelse loop_hibernate == F), + + % The queue works as expected without hibernation + ?assertEqual({ok, [potato]}, couch_work_queue:dequeue(Q)), + + couch_work_queue:close(Q). + +% Helper functions + +hibernate_cfg() -> + couch_util:hibernate_after(couch_work_queue). 
+ +wait_hibernate(Pid) -> + WaitFun = fun() -> + {current_function, {M, F, _}} = process_info(Pid, current_function), + case {M, F} of + {erlang, hibernate} -> ok; + {gen_server, loop_hibernate} -> ok; + {_, _} -> wait + end + end, + test_util:wait(WaitFun). diff --git a/src/rexi/src/rexi_buffer.erl b/src/rexi/src/rexi_buffer.erl index 73bf3dfae..90f8bf2e6 100644 --- a/src/rexi/src/rexi_buffer.erl +++ b/src/rexi/src/rexi_buffer.erl @@ -42,7 +42,8 @@ }). start_link(ServerId) -> - gen_server:start_link({local, ServerId}, ?MODULE, [ServerId], []). + GenOpts = couch_util:hibernate_after(?MODULE), + gen_server:start_link({local, ServerId}, ?MODULE, [ServerId], GenOpts). send(Dest, Msg) -> Server = list_to_atom(lists:concat([rexi_buffer, "_", get_node(Dest)])), @@ -90,9 +91,10 @@ handle_info(timeout, #state{sender = nil, count = C} = State) when C > 0 -> counters:add(Counter, 1, -1), case erlang:send(Dest, Msg, [noconnect, nosuspend]) of ok when C =:= 1 -> - % We just sent the last queued messsage, we'll use this opportunity - % to hibernate the process and run a garbage collection - {noreply, NewState, hibernate}; + % We just sent the last queued message. If we stay idle for a + % while, as configured via couch_util:hibernate_after/1, we'll go + % into hibernation. + {noreply, NewState}; ok when C > 1 -> % Use a zero timeout to recurse into this handler ASAP {noreply, NewState, 0};
