On 6/25/23 19:35, Han Zhou wrote:
> When a server becomes unstable due to overloading or intermittent
> partitioning, it may miss some heartbeats and then starts election with
> a new term, which would disrupt the otherwise healthy cluster formed by
> the rest of the healthy nodes.  Such situation may exist for a long time
> until the "flapping" server is shutdown or recovered completely, which
> can severely impact the availability of the cluster. The pre-vote
> mechanism introduced in the raft paper section 9.6 can prevent such
> problems. This patch implements the pre-vote mechanism, in a way that is
> backward compatible and safe for upgrades.

Thanks, Han!  This is definitely interesting.  I don't think that this
should be a solution for overloading though, overloading should be fixed
by, well, reducing the load or increasing the election timer.  But some
intermittent network issues is indeed a valid case.

I looked through the code and I don't see any significant problems right
away, but I would like to experiment with it a bit more next week, since
the voting logic is fairly complex in general.

It's nice to see you found a use case for the stop-raft-rpc failure
test. :)

See some comments inline.

Best regards, Ilya Maximets.

> 
> Signed-off-by: Han Zhou <[email protected]>
> ---
>  ovsdb/raft-rpc.c       | 19 ++++++++-
>  ovsdb/raft-rpc.h       |  3 ++
>  ovsdb/raft.c           | 88 ++++++++++++++++++++++++++++++------------
>  tests/ovsdb-cluster.at | 42 ++++++++++++++++++++
>  4 files changed, 127 insertions(+), 25 deletions(-)
> 
> diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
> index dd14d81091fc..f750ba897046 100644
> --- a/ovsdb/raft-rpc.c
> +++ b/ovsdb/raft-rpc.c
> @@ -283,6 +283,10 @@ raft_vote_request_to_jsonrpc(const struct 
> raft_vote_request *rq,
>          json_object_put(args, "leadership_transfer",
>                          json_boolean_create(true));
>      }
> +    if (rq->is_prevote) {
> +        json_object_put(args, "is_prevote",
> +                        json_boolean_create(true));
> +    }
>  }
>  
>  static void
> @@ -294,6 +298,8 @@ raft_vote_request_from_jsonrpc(struct ovsdb_parser *p,
>      rq->last_log_term = raft_parse_required_uint64(p, "last_log_term");
>      rq->leadership_transfer
>          = raft_parse_optional_boolean(p, "leadership_transfer") == 1;
> +    rq->is_prevote
> +        = raft_parse_optional_boolean(p, "is_prevote") == 1;
>  }
>  
>  static void
> @@ -305,6 +311,9 @@ raft_format_vote_request(const struct raft_vote_request 
> *rq, struct ds *s)
>      if (rq->leadership_transfer) {
>          ds_put_cstr(s, " leadership_transfer=true");
>      }
> +    if (rq->is_prevote) {
> +        ds_put_cstr(s, " is_prevote=true");
> +    }
>  }
>  
>  /* raft_vote_reply. */
> @@ -326,6 +335,9 @@ raft_vote_reply_to_jsonrpc(const struct raft_vote_reply 
> *rpy,
>  {
>      raft_put_uint64(args, "term", rpy->term);
>      json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote));
> +    if (rpy->is_prevote) {
> +        json_object_put(args, "is_prevote", json_boolean_create(true));
> +    }
>  }
>  
>  static void
> @@ -334,6 +346,7 @@ raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p,
>  {
>      rpy->term = raft_parse_required_uint64(p, "term");
>      rpy->vote = raft_parse_required_uuid(p, "vote");
> +    rpy->is_prevote = raft_parse_optional_boolean(p, "is_prevote") == 1;
>  }
>  
>  static void
> @@ -341,6 +354,9 @@ raft_format_vote_reply(const struct raft_vote_reply *rpy, 
> struct ds *s)
>  {
>      ds_put_format(s, " term=%"PRIu64, rpy->term);
>      ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote));
> +    if (rpy->is_prevote) {
> +        ds_put_cstr(s, " is_prevote=true");
> +    }
>  }
>  
>  /* raft_add_server_request */
> @@ -1008,7 +1024,8 @@ raft_rpc_get_vote(const union raft_rpc *rpc)
>          return NULL;
>  
>      case RAFT_RPC_VOTE_REPLY:
> -        return &raft_vote_reply_cast(rpc)->vote;
> +        return raft_vote_reply_cast(rpc)->is_prevote ? NULL :
> +            &raft_vote_reply_cast(rpc)->vote;

It might be better to create a pointer variable here and access it
instead of casting twice.

>  
>      default:
>          OVS_NOT_REACHED();
> diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h
> index 221f24d00128..10f30618e3e0 100644
> --- a/ovsdb/raft-rpc.h
> +++ b/ovsdb/raft-rpc.h
> @@ -125,12 +125,15 @@ struct raft_vote_request {
>      uint64_t last_log_index; /* Index of candidate's last log entry. */
>      uint64_t last_log_term;  /* Term of candidate's last log entry. */
>      bool leadership_transfer;  /* True to override minimum election timeout. 
> */
> +    bool is_prevote;         /* True: pre-vote; False: real vote (default). 
> */
>  };
>  
>  struct raft_vote_reply {
>      struct raft_rpc_common common;
>      uint64_t term;          /* Current term, for candidate to update itself. 
> */
>      struct uuid vote;       /* Server ID of vote. */
> +    bool is_prevote;        /* Copy the is_prevote from the request, 
> primarily
> +                               for validation. */

Each comment line should start with a '*'.

"Copy of the ..." ?

>  };
>  
>  struct raft_add_server_request {
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index b2361b1737a2..4d7a3f112cad 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -305,6 +305,11 @@ struct raft {
>  
>      /* Candidates only.  Reinitialized at start of election. */
>      int n_votes;                /* Number of votes for me. */
> +    bool prevote_passed;        /* Indicates if it passed pre-vote phase.
> +                                   Pre-vote mechanism is introduced in raft
> +                                   paper section 9.6. We implement it as a
> +                                   sub-state of candidate to minize the 
> change
> +                                   and keep backward compatibility. */

Each comment line should start with a '*'.

And s/minize/minimize/.

>  
>      /* Followers and candidates only. */
>      bool candidate_retrying;    /* The earlier election timed-out and we are
> @@ -372,7 +377,8 @@ static void raft_become_follower(struct raft *);
>  static void raft_reset_election_timer(struct raft *);
>  static void raft_reset_ping_timer(struct raft *);
>  static void raft_send_heartbeats(struct raft *);
> -static void raft_start_election(struct raft *, bool leadership_transfer);
> +static void raft_start_election(struct raft *, bool is_prevote,
> +                                bool leadership_transfer);
>  static bool raft_truncate(struct raft *, uint64_t new_end);
>  static void raft_get_servers_from_log(struct raft *, enum vlog_level);
>  static void raft_get_election_timer_from_log(struct raft *);
> @@ -1069,7 +1075,8 @@ raft_open(struct ovsdb_log *log, struct raft **raftp)
>          /* If there's only one server, start an election right away so that 
> the
>           * cluster bootstraps quickly. */
>          if (hmap_count(&raft->servers) == 1) {
> -            raft_start_election(raft, false);
> +            /* No pre-vote needed since we are the only one. */
> +            raft_start_election(raft, false, false);
>          }
>      } else {
>          raft->join_timeout = time_msec() + 1000;
> @@ -1360,7 +1367,7 @@ void
>  raft_take_leadership(struct raft *raft)
>  {
>      if (raft->role != RAFT_LEADER) {
> -        raft_start_election(raft, true);
> +        raft_start_election(raft, false, true);
>      }
>  }
>  
> @@ -1766,12 +1773,12 @@ raft_set_term(struct raft *raft, uint64_t term, const 
> struct uuid *vote)
>      return true;
>  }
>  
> -static void
> +static bool
>  raft_accept_vote(struct raft *raft, struct raft_server *s,
>                   const struct uuid *vote)
>  {
>      if (uuid_equals(&s->vote, vote)) {
> -        return;
> +        return false;
>      }
>      if (!uuid_is_zero(&s->vote)) {
>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> @@ -1785,13 +1792,18 @@ raft_accept_vote(struct raft *raft, struct 
> raft_server *s,
>      s->vote = *vote;
>      if (uuid_equals(vote, &raft->sid)
>          && ++raft->n_votes > hmap_count(&raft->servers) / 2) {
> -        raft_become_leader(raft);
> +        return true;
>      }
> +    return false;
>  }
>  
>  static void
> -raft_start_election(struct raft *raft, bool leadership_transfer)
> +raft_start_election(struct raft *raft, bool is_prevote,
> +                    bool leadership_transfer)
>  {
> +    /* Leadership transfer doesn't use prevote. */
> +    ovs_assert(!is_prevote || !leadership_transfer);
> +
>      if (raft->leaving) {
>          return;
>      }
> @@ -1801,7 +1813,7 @@ raft_start_election(struct raft *raft, bool 
> leadership_transfer)
>          return;
>      }
>  
> -    if (!raft_set_term(raft, raft->term + 1, &raft->sid)) {
> +    if (!is_prevote && !raft_set_term(raft, raft->term + 1, &raft->sid)) {
>          return;
>      }
>  
> @@ -1809,26 +1821,33 @@ raft_start_election(struct raft *raft, bool 
> leadership_transfer)
>  
>      raft->leader_sid = UUID_ZERO;
>      raft->role = RAFT_CANDIDATE;
> -    /* If there was no leader elected since last election, we know we are
> -     * retrying now. */
> -    raft->candidate_retrying = !raft->had_leader;
> -    raft->had_leader = false;
> +    raft->prevote_passed = !is_prevote;
> +
> +    if (is_prevote || leadership_transfer) {
> +        /* If there was no leader elected since last election, we know we are
> +         * retrying now. */
> +        raft->candidate_retrying = !raft->had_leader;
> +        raft->had_leader = false;
> +
> +        raft->election_start = time_msec();
> +        raft->election_won = 0;
> +    }
>  
>      raft->n_votes = 0;
>  
> -    raft->election_start = time_msec();
> -    raft->election_won = 0;
>      raft->leadership_transfer = leadership_transfer;
>  
>      static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>      if (!VLOG_DROP_INFO(&rl)) {
>          long long int now = time_msec();
> +        char *comment = is_prevote ? "prevote" : "vote";

Should user-visible logs say "pre-vote" instead of "prevote" ?

>          if (now >= raft->election_timeout) {
>              VLOG_INFO("term %"PRIu64": %lld ms timeout expired, "
> -                      "starting election",
> -                      raft->term, now - raft->election_base);
> +                      "starting election (%s)",
> +                      raft->term, now - raft->election_base, comment);
>          } else {
> -            VLOG_INFO("term %"PRIu64": starting election", raft->term);
> +            VLOG_INFO("term %"PRIu64": starting election (%s)",
> +                      raft->term, comment);
>          }
>      }
>      raft_reset_election_timer(raft);
> @@ -1853,6 +1872,7 @@ raft_start_election(struct raft *raft, bool 
> leadership_transfer)
>                      ? raft->entries[raft->log_end - raft->log_start - 1].term
>                      : raft->snap.term),
>                  .leadership_transfer = leadership_transfer,
> +                .is_prevote = is_prevote,
>              },
>          };
>          if (failure_test != FT_DONT_SEND_VOTE_REQUEST) {
> @@ -1861,7 +1881,13 @@ raft_start_election(struct raft *raft, bool 
> leadership_transfer)
>      }
>  
>      /* Vote for ourselves. */
> -    raft_accept_vote(raft, me, &raft->sid);
> +    if (raft_accept_vote(raft, me, &raft->sid)) {
> +        /* We just started vote, so it shouldn't be accepted yet unless this 
> is
> +         * a one-node cluster. In such case we don't do pre-vote, and become
> +         * leader immediately. */
> +        ovs_assert(!is_prevote);
> +        raft_become_leader(raft);
> +    }
>  }
>  
>  static void
> @@ -2041,10 +2067,10 @@ raft_run(struct raft *raft)
>                  raft_reset_election_timer(raft);
>              } else {
>                  raft_become_follower(raft);
> -                raft_start_election(raft, false);
> +                raft_start_election(raft, true, false);
>              }
>          } else {
> -            raft_start_election(raft, false);
> +            raft_start_election(raft, true, false);
>          }
>  
>      }
> @@ -3673,6 +3699,10 @@ raft_handle_vote_request__(struct raft *raft,
>          return false;
>      }
>  
> +    if (rq->is_prevote) {
> +        return true;
> +    }
> +
>      /* Record a vote for the peer. */
>      if (!raft_set_term(raft, raft->term, &rq->common.sid)) {
>          return false;
> @@ -3685,7 +3715,7 @@ raft_handle_vote_request__(struct raft *raft,
>  
>  static void
>  raft_send_vote_reply(struct raft *raft, const struct uuid *dst,
> -                     const struct uuid *vote)
> +                     const struct uuid *vote, bool is_prevote)
>  {
>      union raft_rpc rpy = {
>          .vote_reply = {
> @@ -3695,6 +3725,7 @@ raft_send_vote_reply(struct raft *raft, const struct 
> uuid *dst,
>              },
>              .term = raft->term,
>              .vote = *vote,
> +            .is_prevote = is_prevote,
>          },
>      };
>      raft_send(raft, &rpy);
> @@ -3705,7 +3736,9 @@ raft_handle_vote_request(struct raft *raft,
>                           const struct raft_vote_request *rq)
>  {
>      if (raft_handle_vote_request__(raft, rq)) {
> -        raft_send_vote_reply(raft, &rq->common.sid, &raft->vote);
> +        raft_send_vote_reply(raft, &rq->common.sid,
> +                             rq->is_prevote ? &rq->common.sid : &raft->vote,
> +                             rq->is_prevote);
>      }
>  }
>  
> @@ -3723,7 +3756,14 @@ raft_handle_vote_reply(struct raft *raft,
>  
>      struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
>      if (s) {
> -        raft_accept_vote(raft, s, &rpy->vote);
> +        if (raft_accept_vote(raft, s, &rpy->vote)) {

IIUC, the raft_accept_vote() here can't succeed for pre-vote because
everyone votes for themselves on pre-vote.  Is that correct?

What happens if we received a real vote during a pre-vote phase for
some reason?  Will this logic fail?  Is this scenario possible?

> +            if (raft->prevote_passed) {
> +                raft_become_leader(raft);
> +            } else {
> +                /* Start the real election. */
> +                raft_start_election(raft, false, false);
> +            }
> +        }
>      }
>  }
>  
> @@ -4357,7 +4397,7 @@ raft_handle_become_leader(struct raft *raft,
>          VLOG_INFO("received leadership transfer from %s in term %"PRIu64,
>                    raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf),
>                    rq->term);
> -        raft_start_election(raft, true);
> +        raft_start_election(raft, false, true);
>      }
>  }
>  
> diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
> index 9fbf5dc897f2..8a81136999e0 100644
> --- a/tests/ovsdb-cluster.at
> +++ b/tests/ovsdb-cluster.at
> @@ -715,6 +715,48 @@ done
>  
>  AT_CLEANUP
>  
> +
> +AT_SETUP([OVSDB cluster - disruptive server])
> +AT_KEYWORDS([ovsdb server negative unix cluster disruptive])
> +
> +n=3
> +schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> +ordinal_schema > schema
> +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db 
> $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
> +cid=`ovsdb-tool db-cid s1.db`
> +schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> +for i in `seq 2 $n`; do
> +    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft 
> unix:s1.raft])
> +done
> +
> +on_exit 'kill `cat *.pid`'
> +for i in `seq $n`; do
> +    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir 
> --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb 
> s$i.db])
> +done
> +for i in `seq $n`; do
> +    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
> +done
> +
> +# An unstable follower shouldn't disrupt the healthy cluster - shouldn't
> +# trigger term change.
> +AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test stop-raft-rpc], [0], 
> [ignore])
> +OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s2 cluster/status $schema_name | grep 
> "Role: candidate"])
> +AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test clear], [0], 
> [ignore])
> +
> +# Should step back to follower.
> +OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s2 cluster/status $schema_name | grep 
> "Role: follower"])
> +
> +# No term change.
> +for i in `seq $n`; do
> +    AT_CHECK([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep 
> "Term: 1"], [0], [ignore])
> +done
> +
> +for i in `seq $n`; do
> +    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
> +done
> +
> +AT_CLEANUP
> +
>  
>  AT_BANNER([OVSDB - cluster tests])
>  

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to