This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch add-limit-to-replicator-job-stop in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 7b0ea8acd2c136b41456d3afd856f635f3edc000 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Thu Jul 3 13:41:36 2025 -0400 Don't wait indefinitely for replication jobs to stop Previously we used `gen_server:stop/3` with an infinity timeout. We have observed that it's possible for jobs to be stuck waiting for network requests so they may take indefinitely long to process the shutdown request (and call their `terminate/2` callback) and that can block the replicator scheduler. To fix it, add a 5 second timeout to the stop call and then forcibly kill the process. --- src/couch_replicator/src/couch_replicator_scheduler_job.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl b/src/couch_replicator/src/couch_replicator_scheduler_job.erl index 544c5602a..7f123441f 100644 --- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl +++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl @@ -47,6 +47,7 @@ -define(LOWEST_SEQ, 0). -define(DEFAULT_CHECKPOINT_INTERVAL, 30000). -define(STARTUP_JITTER_DEFAULT, 5000). +-define(STOP_TIMEOUT_MSEC, 5000). -record(rep_state, { rep_details, @@ -110,7 +111,8 @@ stop(Pid) when is_pid(Pid) -> % won't return ok but exit the calling process, usually the scheduler, so % we guard against that. See: % www.erlang.org/doc/apps/stdlib/gen_server.html#stop/3 - catch gen_server:stop(Pid, shutdown, infinity), + catch gen_server:stop(Pid, shutdown, ?STOP_TIMEOUT_MSEC), + exit(Pid, kill), receive {'DOWN', Ref, _, _, Reason} -> Reason end,
