This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git


The following commit(s) were added to refs/heads/master by this push:
     new b0f673f  In _scheduler/docs fix `crashing` state showing as `pending` sometimes
b0f673f is described below

commit b0f673fb51bf521f96729499e939e29f0c58fe8c
Author: Nick Vatamaniuc <vatam...@apache.org>
AuthorDate: Tue Apr 10 10:31:53 2018 -0400

    In _scheduler/docs fix `crashing` state showing as `pending` sometimes
    
    Replication jobs are backed off based on the number of consecutive crashes,
    that is, we count the number of crashes in a row and then penalize jobs with
    an exponential wait based on that number. After a job runs without crashing
    for 2 minutes, we consider it healthy and stop going back in its history
    looking for crashes.
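
    As a rough, hypothetical sketch of the penalty described above (the Base and
    Max values and the function name are made up, not the scheduler's real
    constants or code):

        %% Exponential wait derived from the consecutive crash count.
        backoff_wait_sec(ConsecutiveCrashes) ->
            Base = 5,       % hypothetical base wait in seconds
            Max = 3600,     % hypothetical cap in seconds
            min(Max, Base * (1 bsl ConsecutiveCrashes)).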
    
    Previously a job's state was set to `crashing` only if there were any
    consecutive errors. So a job could have run for 3 minutes, then the user
    deletes the source database, and the job crashes and stops. Until it runs
    again the state would have been shown as `pending`. For internal accounting
    purposes that's correct, but it is confusing for the user because the last
    event in its history is a crash.
    
    This commit makes sure that if the last event in a job's history is a crash,
    the user will see the job as `crashing` with the respective crash reason.
    The scheduling algorithm didn't change.
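
    For example (values below are illustrative only, not actual output), a
    stopped job whose most recent history event is a crash will now be
    summarized with something like

        {state, crashing}
        {info, <<"db_not_found: could not open db1">>}

    instead of `{state, pending}` with a `null` info.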
    
    Fixes #1276
---
 .../src/couch_replicator_scheduler.erl             | 82 ++++++++++++++++++++--
 1 file changed, 78 insertions(+), 4 deletions(-)

diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl
index 0b39634..50896c5 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler.erl
@@ -138,11 +138,15 @@ job_summary(JobId, HealthThreshold) ->
             ErrorCount = consecutive_crashes(History, HealthThreshold),
             {State, Info} = case {Pid, ErrorCount} of
                 {undefined, 0}  ->
-                    {pending, null};
+                    case History of
+                        [{{crashed, Error}, _When} | _] ->
+                            {crashing, crash_reason_json(Error)};
+                        [_ | _] ->
+                            {pending, null}
+                    end;
                 {undefined, ErrorCount} when ErrorCount > 0 ->
                      [{{crashed, Error}, _When} | _] = History,
-                     ErrMsg = couch_replicator_utils:rep_error_to_binary(Error),
-                     {crashing, ErrMsg};
+                     {crashing, crash_reason_json(Error)};
                 {Pid, ErrorCount} when is_pid(Pid) ->
                      {running, null}
             end,
@@ -1021,7 +1025,11 @@ scheduler_test_() ->
             t_oneshot_will_hog_the_scheduler(),
             t_if_excess_is_trimmed_rotation_doesnt_happen(),
             t_if_transient_job_crashes_it_gets_removed(),
-            t_if_permanent_job_crashes_it_stays_in_ets()
+            t_if_permanent_job_crashes_it_stays_in_ets(),
+            t_job_summary_running(),
+            t_job_summary_pending(),
+            t_job_summary_crashing_once(),
+            t_job_summary_crashing_many_times()
          ]
     }.
 
@@ -1300,6 +1308,72 @@ t_if_permanent_job_crashes_it_stays_in_ets() ->
    end).
 
 
+t_job_summary_running() ->
+    ?_test(begin
+        Job =  #job{
+            id = job1,
+            pid = mock_pid(),
+            history = [added()],
+            rep = #rep{
+                db_name = <<"db1">>,
+                source = <<"s">>,
+                target = <<"t">>
+            }
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(running, proplists:get_value(state, Summary)),
+        ?assertEqual(null, proplists:get_value(info, Summary)),
+        ?assertEqual(0, proplists:get_value(error_count, Summary))
+    end).
+
+
+t_job_summary_pending() ->
+    ?_test(begin
+        Job =  #job{
+            id = job1,
+            pid = undefined,
+            history = [stopped(20), started(10), added()],
+            rep = #rep{source = <<"s">>, target = <<"t">>}
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(pending, proplists:get_value(state, Summary)),
+        ?assertEqual(null, proplists:get_value(info, Summary)),
+        ?assertEqual(0, proplists:get_value(error_count, Summary))
+    end).
+
+
+t_job_summary_crashing_once() ->
+    ?_test(begin
+        Job =  #job{
+            id = job1,
+            history = [crashed(?DEFAULT_HEALTH_THRESHOLD_SEC + 1), started(0)],
+            rep = #rep{source = <<"s">>, target = <<"t">>}
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(crashing, proplists:get_value(state, Summary)),
+        ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+        ?assertEqual(0, proplists:get_value(error_count, Summary))
+    end).
+
+
+t_job_summary_crashing_many_times() ->
+    ?_test(begin
+        Job =  #job{
+            id = job1,
+            history = [crashed(4), started(3), crashed(2), started(1)],
+            rep = #rep{source = <<"s">>, target = <<"t">>}
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(crashing, proplists:get_value(state, Summary)),
+        ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+        ?assertEqual(2, proplists:get_value(error_count, Summary))
+    end).
+
+
 % Test helper functions
 
 setup() ->

-- 
To stop receiving notification emails like this one, please contact
vatam...@apache.org.
