This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/couchdb.git
The following commit(s) were added to refs/heads/master by this push: new b0f673f In _scheduler/docs fix `crashing` state showing as `pending` sometimes b0f673f is described below commit b0f673fb51bf521f96729499e939e29f0c58fe8c Author: Nick Vatamaniuc <vatam...@apache.org> AuthorDate: Tue Apr 10 10:31:53 2018 -0400 In _scheduler/docs fix `crashing` state showing as `pending` sometimes Replication jobs are backed off based on the number of consecutive crashes, that is, we count the number of crashes in a row and then penalize jobs with an exponential wait based on that number. After a job runs without crashing for 2 minutes, we consider it healthy and stop going back in its history and looking for crashes. Previously a job's state was set to `crashing` only if there were any consecutive errors. So it could have run for 3 minutes, then the user deletes the source database, and the job crashes and stops. Until it runs again, the state would have been shown as `pending`. For internal accounting purposes that's correct, but it is confusing for the user because the last event in its history is a crash. This commit makes sure that if the last event in a job's history is a crash, the user will see the job as `crashing` with the respective crash reason. The scheduling algorithm didn't change. 
Fixes #1276 --- .../src/couch_replicator_scheduler.erl | 82 ++++++++++++++++++++-- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl index 0b39634..50896c5 100644 --- a/src/couch_replicator/src/couch_replicator_scheduler.erl +++ b/src/couch_replicator/src/couch_replicator_scheduler.erl @@ -138,11 +138,15 @@ job_summary(JobId, HealthThreshold) -> ErrorCount = consecutive_crashes(History, HealthThreshold), {State, Info} = case {Pid, ErrorCount} of {undefined, 0} -> - {pending, null}; + case History of + [{{crashed, Error}, _When} | _] -> + {crashing, crash_reason_json(Error)}; + [_ | _] -> + {pending, null} + end; {undefined, ErrorCount} when ErrorCount > 0 -> [{{crashed, Error}, _When} | _] = History, - ErrMsg = couch_replicator_utils:rep_error_to_binary(Error), - {crashing, ErrMsg}; + {crashing, crash_reason_json(Error)}; {Pid, ErrorCount} when is_pid(Pid) -> {running, null} end, @@ -1021,7 +1025,11 @@ scheduler_test_() -> t_oneshot_will_hog_the_scheduler(), t_if_excess_is_trimmed_rotation_doesnt_happen(), t_if_transient_job_crashes_it_gets_removed(), - t_if_permanent_job_crashes_it_stays_in_ets() + t_if_permanent_job_crashes_it_stays_in_ets(), + t_job_summary_running(), + t_job_summary_pending(), + t_job_summary_crashing_once(), + t_job_summary_crashing_many_times() ] }. @@ -1300,6 +1308,72 @@ t_if_permanent_job_crashes_it_stays_in_ets() -> end). +t_job_summary_running() -> + ?_test(begin + Job = #job{ + id = job1, + pid = mock_pid(), + history = [added()], + rep = #rep{ + db_name = <<"db1">>, + source = <<"s">>, + target = <<"t">> + } + }, + setup_jobs([Job]), + Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC), + ?assertEqual(running, proplists:get_value(state, Summary)), + ?assertEqual(null, proplists:get_value(info, Summary)), + ?assertEqual(0, proplists:get_value(error_count, Summary)) + end). 
+ + +t_job_summary_pending() -> + ?_test(begin + Job = #job{ + id = job1, + pid = undefined, + history = [stopped(20), started(10), added()], + rep = #rep{source = <<"s">>, target = <<"t">>} + }, + setup_jobs([Job]), + Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC), + ?assertEqual(pending, proplists:get_value(state, Summary)), + ?assertEqual(null, proplists:get_value(info, Summary)), + ?assertEqual(0, proplists:get_value(error_count, Summary)) + end). + + +t_job_summary_crashing_once() -> + ?_test(begin + Job = #job{ + id = job1, + history = [crashed(?DEFAULT_HEALTH_THRESHOLD_SEC + 1), started(0)], + rep = #rep{source = <<"s">>, target = <<"t">>} + }, + setup_jobs([Job]), + Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC), + ?assertEqual(crashing, proplists:get_value(state, Summary)), + ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)), + ?assertEqual(0, proplists:get_value(error_count, Summary)) + end). + + +t_job_summary_crashing_many_times() -> + ?_test(begin + Job = #job{ + id = job1, + history = [crashed(4), started(3), crashed(2), started(1)], + rep = #rep{source = <<"s">>, target = <<"t">>} + }, + setup_jobs([Job]), + Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC), + ?assertEqual(crashing, proplists:get_value(state, Summary)), + ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)), + ?assertEqual(2, proplists:get_value(error_count, Summary)) + end). + + % Test helper functions setup() -> -- To stop receiving notification emails like this one, please contact vatam...@apache.org.