[couchdb] 12/13: feat: allow restarting of failed jobs

ronny Sun, 23 Jul 2023 05:43:09 -0700

This is an automated email from the ASF dual-hosted git repository.

ronny pushed a commit to branch nouveau4win
in repository https://gitbox.apache.org/repos/asf/couchdb.git


commit ba41a23eca7ed01016fa3e6cc0122d4d7c6e68d8
Author: Jan Lehnardt <[email protected]>
AuthorDate: Wed Jul 19 14:07:12 2023 +0200

    feat: allow restarting of failed jobs
---
 src/mem3/src/mem3_reshard.erl                 |  2 +-
 src/mem3/test/eunit/mem3_reshard_api_test.erl | 32 +++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/mem3/src/mem3_reshard.erl b/src/mem3/src/mem3_reshard.erl
index ec08c72cd..b3bed69ff 100644
--- a/src/mem3/src/mem3_reshard.erl
+++ b/src/mem3/src/mem3_reshard.erl
@@ -266,7 +266,7 @@ handle_call({resume_job, _}, _From, #state{state = stopped} = State) ->
 handle_call({resume_job, Id}, _From, State) ->
     couch_log:notice("~p resume_job call ~p", [?MODULE, Id]),
     case job_by_id(Id) of
-        #job{job_state = stopped} = Job ->
+        #job{job_state = JobState} = Job when JobState == stopped; JobState == failed ->
             case start_job_int(Job, State) of
                 ok ->
                     {reply, ok, State};
diff --git a/src/mem3/test/eunit/mem3_reshard_api_test.erl b/src/mem3/test/eunit/mem3_reshard_api_test.erl
index 6cbc4dba8..59ec90553 100644
--- a/src/mem3/test/eunit/mem3_reshard_api_test.erl
+++ b/src/mem3/test/eunit/mem3_reshard_api_test.erl
@@ -50,6 +50,8 @@ teardown({Url, {Db1, Db2, Db3}}) ->
     ok = config:delete("reshard", "max_jobs", Persist),
     ok = config:delete("reshard", "require_node_param", Persist),
     ok = config:delete("reshard", "require_range_param", Persist),
+    ok = config:delete("reshard", "max_retries", Persist),
+    ok = config:delete("reshard", "retry_interval_sec", Persist),
     ok = config:delete("admins", ?USER, Persist),
     meck:unload().
 
@@ -79,6 +81,7 @@ mem3_reshard_api_test_() ->
                     fun test_disabled/1,
                     fun start_stop_cluster_with_a_job/1,
                     fun individual_job_start_stop/1,
+                    fun individual_job_start_after_failure/1,
                     fun individual_job_stop_when_cluster_stopped/1,
                     fun create_job_with_invalid_arguments/1,
                     fun create_job_with_db/1,
@@ -417,6 +420,35 @@ individual_job_start_stop({Top, {Db1, _, _}}) ->
             wait_state(StUrl, <<"completed">>)
         end)}.
 
+individual_job_start_after_failure({Top, {Db1, _, _}}) ->
+    {timeout, ?TIMEOUT,
+        ?_test(begin
+            config:set("reshard", "retry_interval_sec", "0", false),
+            config:set("reshard", "max_retries", "1", false),
+            meck:expect(couch_db_split, split, fun(_, _, _) ->
+                meck:exception(error, kapow)
+            end),
+
+            Body = #{type => split, db => Db1},
+            {201, [#{?ID := Id}]} = req(post, Top ++ ?JOBS, Body),
+
+            JobUrl = Top ++ ?JOBS ++ ?b2l(Id),
+            StUrl = JobUrl ++ "/state",
+
+            wait_state(StUrl, <<"failed">>),
+
+            % Stop/start resharding globally and job should still stay failed
+            ?assertMatch({200, _}, req(put, Top ++ ?STATE, #{state => stopped})),
+            ?assertMatch({200, _}, req(put, Top ++ ?STATE, #{state => running})),
+            ?assertMatch({200, #{<<"state">> := <<"failed">>}}, req(get, StUrl)),
+
+            meck:unload(),
+
+            % Start the job again
+            ?assertMatch({200, _}, req(put, StUrl, #{state => running})),
+            wait_state(StUrl, <<"completed">>)
+        end)}.
+
 individual_job_stop_when_cluster_stopped({Top, {Db1, _, _}}) ->
     {timeout, ?TIMEOUT,
         ?_test(begin

[couchdb] 12/13: feat: allow restarting of failed jobs

Reply via email to