This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch handle-upgrade-case-for-instance-start-time
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 3b18d04cfc5eb723bd514ebc4659d3fd72842b51
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Wed May 24 23:34:45 2023 -0400

    Handle replicator instance start time during upgrades better
    
    During cluster upgrades from 3.2 to 3.3, when instance start time switched
    from always being `0` to an actual timestamp, replication jobs will crash
    when endpoints are upgraded. Replication jobs were started when the endpoint
    emitted a `0`; it then becomes a non-`0` value, which will crash the next
    checkpoint attempt.
    
    After the crash, jobs will restart and continue fine where they left off
    without rewinding. However, they will make a logging mess while they crash.
    All four workers will exit with the `{checkpoint_commit_failure,...}` error.
    This commit makes the checkpoint ignore mismatches if one of the instance
    start times is 0.
---
 .../src/couch_replicator_scheduler_job.erl          | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl 
b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
index e16412e4a..cd751d8f2 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
@@ -785,9 +785,9 @@ do_checkpoint(State) ->
         current_through_seq = {_Ts, NewSeq} = NewTsSeq,
         source_log = SourceLog,
         target_log = TargetLog,
-        rep_starttime = ReplicationStartTime,
-        src_starttime = SrcInstanceStartTime,
-        tgt_starttime = TgtInstanceStartTime,
+        rep_starttime = RepStartTs,
+        src_starttime = SrcStartTs,
+        tgt_starttime = TgtStartTs,
         stats = Stats,
         rep_details = #rep{options = Options},
         session_id = SessionId
@@ -799,13 +799,16 @@ do_checkpoint(State) ->
         {target_error, Reason} ->
             {checkpoint_commit_failure,
                 <<"Failure on target commit: ", (to_binary(Reason))/binary>>};
-        {SrcInstanceStartTime, TgtInstanceStartTime} ->
+        {<<S/binary>>, <<T/binary>>} when
+            (S =:= SrcStartTs orelse T =:= <<"0">> orelse SrcStartTs =:= 
<<"0">>) andalso
+                (T =:= TgtStartTs orelse T =:= <<"0">> orelse TgtStartTs =:= 
<<"0">>)
+        ->
             couch_log:notice(
                 "recording a checkpoint for `~s` -> `~s` at source update_seq 
~p",
                 [SourceName, TargetName, NewSeq]
             ),
-            LocalStartTime = calendar:now_to_local_time(ReplicationStartTime),
-            StartTime = ?l2b(httpd_util:rfc1123_date(LocalStartTime)),
+            LocalStartTs = calendar:now_to_local_time(RepStartTs),
+            StartTime = ?l2b(httpd_util:rfc1123_date(LocalStartTs)),
             EndTime = ?l2b(httpd_util:rfc1123_date()),
             NewHistoryEntry =
                 {[
@@ -870,15 +873,15 @@ do_checkpoint(State) ->
                 throw:{checkpoint_commit_failure, _} = Failure ->
                     Failure
             end;
-        {SrcInstanceStartTime, _NewTgtInstanceStartTime} ->
+        {SrcStartTs, _NewTgtStartTs} ->
             {checkpoint_commit_failure, <<
                 "instance_start_time on target database has changed since last 
checkpoint."
             >>};
-        {_NewSrcInstanceStartTime, TgtInstanceStartTime} ->
+        {_NewSrcStartTs, TgtStartTs} ->
             {checkpoint_commit_failure, <<
                 "instance_start_time on source database has changed since last 
checkpoint."
             >>};
-        {_NewSrcInstanceStartTime, _NewTgtInstanceStartTime} ->
+        {_NewSrcStartTs, _NewTgtStartTs} ->
             {checkpoint_commit_failure, <<
                 "instance_start_time on source and target database has changed 
since last checkpoint."
             >>}

Reply via email to