This is an automated email from the ASF dual-hosted git repository.

ronny pushed a commit to branch nouveau4win in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit ba41a23eca7ed01016fa3e6cc0122d4d7c6e68d8 Author: Jan Lehnardt <[email protected]> AuthorDate: Wed Jul 19 14:07:12 2023 +0200 feat: allow restarting of failed jobs --- src/mem3/src/mem3_reshard.erl | 2 +- src/mem3/test/eunit/mem3_reshard_api_test.erl | 32 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/mem3/src/mem3_reshard.erl b/src/mem3/src/mem3_reshard.erl index ec08c72cd..b3bed69ff 100644 --- a/src/mem3/src/mem3_reshard.erl +++ b/src/mem3/src/mem3_reshard.erl @@ -266,7 +266,7 @@ handle_call({resume_job, _}, _From, #state{state = stopped} = State) -> handle_call({resume_job, Id}, _From, State) -> couch_log:notice("~p resume_job call ~p", [?MODULE, Id]), case job_by_id(Id) of - #job{job_state = stopped} = Job -> + #job{job_state = JobState} = Job when JobState == stopped; JobState == failed -> case start_job_int(Job, State) of ok -> {reply, ok, State}; diff --git a/src/mem3/test/eunit/mem3_reshard_api_test.erl b/src/mem3/test/eunit/mem3_reshard_api_test.erl index 6cbc4dba8..59ec90553 100644 --- a/src/mem3/test/eunit/mem3_reshard_api_test.erl +++ b/src/mem3/test/eunit/mem3_reshard_api_test.erl @@ -50,6 +50,8 @@ teardown({Url, {Db1, Db2, Db3}}) -> ok = config:delete("reshard", "max_jobs", Persist), ok = config:delete("reshard", "require_node_param", Persist), ok = config:delete("reshard", "require_range_param", Persist), + ok = config:delete("reshard", "max_retries", Persist), + ok = config:delete("reshard", "retry_interval_sec", Persist), ok = config:delete("admins", ?USER, Persist), meck:unload(). @@ -79,6 +81,7 @@ mem3_reshard_api_test_() -> fun test_disabled/1, fun start_stop_cluster_with_a_job/1, fun individual_job_start_stop/1, + fun individual_job_start_after_failure/1, fun individual_job_stop_when_cluster_stopped/1, fun create_job_with_invalid_arguments/1, fun create_job_with_db/1, @@ -417,6 +420,35 @@ individual_job_start_stop({Top, {Db1, _, _}}) -> wait_state(StUrl, <<"completed">>) end)}. 
+individual_job_start_after_failure({Top, {Db1, _, _}}) -> + {timeout, ?TIMEOUT, + ?_test(begin + config:set("reshard", "retry_interval_sec", "0", false), + config:set("reshard", "max_retries", "1", false), + meck:expect(couch_db_split, split, fun(_, _, _) -> + meck:exception(error, kapow) + end), + + Body = #{type => split, db => Db1}, + {201, [#{?ID := Id}]} = req(post, Top ++ ?JOBS, Body), + + JobUrl = Top ++ ?JOBS ++ ?b2l(Id), + StUrl = JobUrl ++ "/state", + + wait_state(StUrl, <<"failed">>), + + % Stop/start resharding globally and job should still stay failed + ?assertMatch({200, _}, req(put, Top ++ ?STATE, #{state => stopped})), + ?assertMatch({200, _}, req(put, Top ++ ?STATE, #{state => running})), + ?assertMatch({200, #{<<"state">> := <<"failed">>}}, req(get, StUrl)), + + meck:unload(), + + % Start the job again + ?assertMatch({200, _}, req(put, StUrl, #{state => running})), + wait_state(StUrl, <<"completed">>) + end)}. + individual_job_stop_when_cluster_stopped({Top, {Db1, _, _}}) -> {timeout, ?TIMEOUT, ?_test(begin
