This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch change-hibernation-strategy in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 74aed00c70993afb5216a3cab8cca58984ef067f Author: Nick Vatamaniuc <[email protected]> AuthorDate: Tue Feb 17 18:08:03 2026 -0500 Switch to hibernate_after While debugging the bug where processes get stuck in hibernation [1], a few benchmarks showed that hibernation can be pretty expensive. I saw a 20% or so reduction in latency in couch_work_queue if we hibernate after every single item insertion. The Erlang documentation warns about this [2]: > Use this feature with care, as hibernation implies at least two garbage collections (when hibernating and shortly after waking up) and is not something you want to do between each call to a busy server. In a few places like the `couch_work_queue` and `couch_db_updater` we did exactly that. However, since we added that, Erlang/OTP implemented a new `gen_server` option - `{hibernate_after, Timeout}`. It will trigger hibernation after a period of idle time. That seems ideal for us - it keeps expensive hibernation out of the main data path, as the docs warn us about, but once the server goes idle we still get to run it to dereference any ref binaries. Since we encountered the recent hibernation bug [1], also add an option to disable it altogether, just to have a way to mitigate the issue when running on OTP 27 and 28 before the fix is out. 
[1] https://github.com/erlang/otp/issues/10651 [2] https://www.erlang.org/doc/apps/stdlib/gen_server.html [3] https://github.com/apache/couchdb/commit/d9eb87f60ac8328afcca25cf6c7aae53b395b089 --- rel/overlay/etc/default.ini | 17 +++++++ src/couch/src/couch_db_updater.erl | 8 +-- src/couch/src/couch_stream.erl | 24 ++++----- src/couch/src/couch_util.erl | 10 ++++ src/couch/src/couch_work_queue.erl | 20 ++++---- src/couch/test/eunit/couch_util_tests.erl | 81 +++++++++++++++++++++++++++++++ src/rexi/src/rexi_buffer.erl | 11 +++-- 7 files changed, 141 insertions(+), 30 deletions(-) diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini index d599656b1..34d902118 100644 --- a/rel/overlay/etc/default.ini +++ b/rel/overlay/etc/default.ini @@ -1270,3 +1270,20 @@ url = {{nouveau_url}} ; periodically reload configuration from file. ; Set to infinity to disable. ;auto_reload_secs = infinity + +[hibernate_after] +; Some processes which handle a large number of referenced binaries can benefit +; from hibernating periodically, so they can run a complete garbage collection +; and dereference those binaries. This section configures idle hibernation +; timeouts for some of those processes. The value is in milliseconds or +; "infinity". Setting the value to "infinity" disables hibernation. The option +; may be used as a mitigation strategy when running with earlier OTP 27/28 +; versions which had a bug [1] which prevented processes from waking up from +; hibernation. +; +; [1] https://github.com/erlang/otp/issues/10651 + +;rexi_buffer = 1000 +;couch_stream = 1000 +;couch_db_updater = 1000 +;couch_work_queue = 1000 diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 8394cd5a3..b4b42001c 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -21,6 +21,7 @@ % 10 GiB -define(DEFAULT_MAX_PARTITION_SIZE, 16#280000000). +-define(DEFAULT_HIBERNATE_AFTER, 1000). 
-record(merge_acc, { revs_limit, @@ -49,7 +50,8 @@ init({Engine, DbName, FilePath, Options0}) -> % couch_db:validate_doc_update, which loads them lazily. NewDb = Db#db{main_pid = self()}, proc_lib:init_ack({ok, NewDb}), - gen_server:enter_loop(?MODULE, [], NewDb) + GenOpts = couch_util:hibernate_after(?MODULE, ?DEFAULT_HIBERNATE_AFTER), + gen_server:enter_loop(?MODULE, GenOpts, NewDb) catch throw:InitError -> proc_lib:init_ack(InitError) @@ -220,11 +222,11 @@ handle_info( false -> Db2 end, - {noreply, Db3, hibernate} + {noreply, Db3} catch throw:retry -> [catch (ClientPid ! {retry, self()}) || ClientPid <- Clients], - {noreply, Db, hibernate} + {noreply, Db} end; handle_info({'EXIT', _Pid, normal}, Db) -> {noreply, Db}; diff --git a/src/couch/src/couch_stream.erl b/src/couch/src/couch_stream.erl index 0e4ccdae6..1b6787c1b 100644 --- a/src/couch/src/couch_stream.erl +++ b/src/couch/src/couch_stream.erl @@ -36,6 +36,7 @@ ]). -define(DEFAULT_BUFFER_SIZE, 4096). +-define(DEFAULT_HIBERNATE_AFTER, 1000). -record(stream, { engine, @@ -58,7 +59,8 @@ open({_StreamEngine, _StreamEngineState} = Engine) -> open(Engine, []). open({_StreamEngine, _StreamEngineState} = Engine, Options) -> - gen_server:start_link(?MODULE, {Engine, self(), erlang:get(io_priority), Options}, []). + GenOpts = couch_util:hibernate_after(?MODULE, ?DEFAULT_HIBERNATE_AFTER), + gen_server:start_link(?MODULE, {Engine, self(), erlang:get(io_priority), Options}, GenOpts). close(Pid) -> gen_server:call(Pid, close, infinity). 
@@ -223,17 +225,15 @@ handle_call({write, Bin}, _From, Stream) -> Md5_2 = couch_hash:md5_hash_update(Md5, WriteBin2) end, - {reply, ok, - Stream#stream{ - engine = NewEngine, - written_len = WrittenLen2, - buffer_list = [], - buffer_len = 0, - md5 = Md5_2, - identity_md5 = IdenMd5_2, - identity_len = IdenLen + BinSize - }, - hibernate}; + {reply, ok, Stream#stream{ + engine = NewEngine, + written_len = WrittenLen2, + buffer_list = [], + buffer_len = 0, + md5 = Md5_2, + identity_md5 = IdenMd5_2, + identity_len = IdenLen + BinSize + }}; true -> {reply, ok, Stream#stream{ buffer_list = [Bin | Buffer], diff --git a/src/couch/src/couch_util.erl b/src/couch/src/couch_util.erl index d93aaebd6..09ca71e1e 100644 --- a/src/couch/src/couch_util.erl +++ b/src/couch/src/couch_util.erl @@ -46,6 +46,7 @@ -export([remove_sensitive_data/1]). -export([ejson_to_map/1]). -export([new_set/0, set_from_list/1]). +-export([hibernate_after/2]). -include_lib("couch/include/couch_db.hrl"). @@ -806,3 +807,12 @@ new_set() -> set_from_list(KVs) -> sets:from_list(KVs, [{version, 2}]). + +hibernate_after(Module, Default) when is_atom(Module) -> + Key = atom_to_list(Module), + case config:get_integer_or_infinity("hibernate_after", Key, Default) of + infinity -> + []; + Timeout when is_integer(Timeout) -> + [{hibernate_after, Timeout}] + end. diff --git a/src/couch/src/couch_work_queue.erl b/src/couch/src/couch_work_queue.erl index 3c6ffeaf8..0285ad671 100644 --- a/src/couch/src/couch_work_queue.erl +++ b/src/couch/src/couch_work_queue.erl @@ -22,6 +22,8 @@ -export([init/1, terminate/2]). -export([handle_call/3, handle_cast/2, handle_info/2]). +-define(DEFAULT_HIBERNATE_AFTER, 1000). + -record(q, { queue = queue:new(), blocked = [], @@ -34,7 +36,8 @@ }). new(Options) -> - gen_server:start_link(couch_work_queue, Options, []). + GenOpts = couch_util:hibernate_after(?MODULE, ?DEFAULT_HIBERNATE_AFTER), + gen_server:start_link(couch_work_queue, Options, GenOpts). 
queue(Wq, Item) when is_binary(Item) -> gen_server:call(Wq, {queue, Item, byte_size(Item)}, infinity); @@ -73,7 +76,7 @@ init(Options) -> max_size = couch_util:get_value(max_size, Options, nil), max_items = couch_util:get_value(max_items, Options, nil) }, - {ok, Q, hibernate}. + {ok, Q}. terminate(_Reason, #q{worker = undefined}) -> ok; @@ -87,18 +90,13 @@ handle_call({queue, Item, Size}, From, #q{worker = undefined} = Q0) -> items = Q0#q.items + 1, queue = queue:in({Item, Size}, Q0#q.queue) }, - case - (Q#q.size >= Q#q.max_size) orelse - (Q#q.items >= Q#q.max_items) - of - true -> - {noreply, Q#q{blocked = [From | Q#q.blocked]}, hibernate}; - false -> - {reply, ok, Q, hibernate} + case (Q#q.size >= Q#q.max_size) orelse (Q#q.items >= Q#q.max_items) of + true -> {noreply, Q#q{blocked = [From | Q#q.blocked]}}; + false -> {reply, ok, Q} end; handle_call({queue, Item, _}, _From, #q{worker = {W, _Max}} = Q) -> gen_server:reply(W, {ok, [Item]}), - {reply, ok, Q#q{worker = undefined}, hibernate}; + {reply, ok, Q#q{worker = undefined}}; handle_call({dequeue, _Max}, _From, #q{worker = {_, _}}) -> % Something went wrong - the same or a different worker is % trying to dequeue an item. We only allow one worker to wait diff --git a/src/couch/test/eunit/couch_util_tests.erl b/src/couch/test/eunit/couch_util_tests.erl index 5f8f1ce43..a7d870d5e 100644 --- a/src/couch/test/eunit/couch_util_tests.erl +++ b/src/couch/test/eunit/couch_util_tests.erl @@ -206,3 +206,84 @@ ejson_to_map_test() -> ?assertEqual(#{a => 1, b => 2}, couch_util:ejson_to_map({[{b, 2}, {a, 1}]})), ?assertEqual(#{<<"a">> => [1, #{}]}, couch_util:ejson_to_map({[{<<"a">>, [1, {[]}]}]})), ?assertEqual([#{true => 1}], couch_util:ejson_to_map([{[{true, 1}]}])). + +hibernate_after_test_() -> + { + foreach, + fun setup/0, + fun teardown/1, + [ + ?TDEF_FE(t_hibernate), + ?TDEF_FE(t_do_not_hibernate) + ] + }. + +setup() -> + test_util:start_applications([config]). 
+ +teardown(Ctx) -> + config:delete("hibernate_after", "couch_work_queue", false), + test_util:stop_applications(Ctx). + +t_hibernate(_) -> + ?assertEqual([{hibernate_after, 999}], hibernate_cfg(999)), + config:set("hibernate_after", "couch_work_queue", "10", false), + ?assertEqual([{hibernate_after, 10}], hibernate_cfg(10)), + + {ok, Q} = couch_work_queue:new([]), + + % Enqueue item, creating gen_server activity + ok = couch_work_queue:queue(Q, potato), + + % Assert that we eventually hibernate + wait_hibernate(Q), + {current_function, {M, F, _}} = process_info(Q, current_function), + ?assertEqual({erlang, hibernate}, {M, F}), + + % We can wake up from hibernation and get back to work + ?assertEqual({ok, [potato]}, couch_work_queue:dequeue(Q)), + + % Then we can get back into hibernation + wait_hibernate(Q), + {current_function, {M, F, _}} = process_info(Q, current_function), + ?assertEqual({erlang, hibernate}, {M, F}), + + couch_work_queue:close(Q). + +t_do_not_hibernate(_) -> + ?assertEqual([], hibernate_cfg(infinity)), + config:set("hibernate_after", "couch_work_queue", "10", false), + ?assertEqual([{hibernate_after, 10}], hibernate_cfg(infinity)), + config:set("hibernate_after", "couch_work_queue", "infinity", false), + ?assertEqual([], hibernate_cfg(10)), + + {ok, Q} = couch_work_queue:new([]), + + % Enqueue item, creating gen_server activity + ok = couch_work_queue:queue(Q, potato), + + % Assert that we do not hibernate + timer:sleep(100), + {current_function, {M, F, _}} = process_info(Q, current_function), + ?assertNotEqual({erlang, hibernate}, {M, F}), + + % The queue works as expected without hibernation + ?assertEqual({ok, [potato]}, couch_work_queue:dequeue(Q)), + + couch_work_queue:close(Q). + + +% Helper functions + +hibernate_cfg(Default) -> + couch_util:hibernate_after(couch_work_queue, Default). 
+ +wait_hibernate(Pid) -> + WaitFun = fun() -> + {current_function, {M, F, _}} = process_info(Pid, current_function), + case {M, F} of + {erlang, hibernate} -> ok; + {_, _} -> wait + end + end, + test_util:wait(WaitFun). diff --git a/src/rexi/src/rexi_buffer.erl b/src/rexi/src/rexi_buffer.erl index 73bf3dfae..c4b0c2360 100644 --- a/src/rexi/src/rexi_buffer.erl +++ b/src/rexi/src/rexi_buffer.erl @@ -31,6 +31,7 @@ -define(BUFFER_COUNT_DEFAULT, 2000). -define(COUNTER, counter). +-define(DEFAULT_HIBERNATE_AFTER, 1000). -record(state, { server_id, @@ -42,7 +43,8 @@ }). start_link(ServerId) -> - gen_server:start_link({local, ServerId}, ?MODULE, [ServerId], []). + GenOpts = couch_util:hibernate_after(?MODULE, ?DEFAULT_HIBERNATE_AFTER), + gen_server:start_link({local, ServerId}, ?MODULE, [ServerId], GenOpts). send(Dest, Msg) -> Server = list_to_atom(lists:concat([rexi_buffer, "_", get_node(Dest)])), @@ -90,9 +92,10 @@ handle_info(timeout, #state{sender = nil, count = C} = State) when C > 0 -> counters:add(Counter, 1, -1), case erlang:send(Dest, Msg, [noconnect, nosuspend]) of ok when C =:= 1 -> - % We just sent the last queued messsage, we'll use this opportunity - % to hibernate the process and run a garbage collection - {noreply, NewState, hibernate}; + % We just sent the last queued message. If we stay idle for a + % while, as configured via couch_util:hibernate_after/2, we'll go + % into hibernation. + {noreply, NewState}; ok when C > 1 -> % Use a zero timeout to recurse into this handler ASAP {noreply, NewState, 0};
