On Tue, Aug 13, 2019 at 9:23 AM Han Zhou <[email protected]> wrote:
>
> From: Han Zhou <[email protected]>
>
> As mentioned in RAFT paper, section 6.2:
>
> Leaders: A server might be in the leader state, but if it isn’t the current
> leader, it could be needlessly delaying client requests. For example, suppose a
> leader is partitioned from the rest of the cluster, but it can still
> communicate with a particular client. Without additional mechanism, it could
> delay a request from that client forever, being unable to replicate a log entry
> to any other servers. Meanwhile, there might be another leader of a newer term
> that is able to communicate with a majority of the cluster and would be able to
> commit the client’s request. Thus, a leader in Raft steps down if an election
> timeout elapses without a successful round of heartbeats to a majority of its
> cluster; this allows clients to retry their requests with another server.
>
> Signed-off-by: Han Zhou <[email protected]>
> ---
>  ovsdb/raft-private.h   |   3 ++
>  ovsdb/raft.c           |  42 ++++++++++++++++-
>  tests/ovsdb-cluster.at | 123 +++++++++++++++++++++++++++++--------------------
>  3 files changed, 116 insertions(+), 52 deletions(-)
>
> diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h
> index 35a3dd7..fb7e6e3 100644
> --- a/ovsdb/raft-private.h
> +++ b/ovsdb/raft-private.h
> @@ -80,6 +80,9 @@ struct raft_server {
>      uint64_t next_index;     /* Index of next log entry to send this server. */
>      uint64_t match_index;    /* Index of max log entry server known to have. */
>      enum raft_server_phase phase;
> +    bool replied;            /* Reply to append_request was received from this
> +                                node during current election_timeout interval.
> +                              */
>      /* For use in adding and removing servers: */
>      struct uuid requester_sid;    /* Nonzero if requested via RPC. */
>      struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index 63c3081..9516a6f 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -1792,7 +1792,43 @@ raft_run(struct raft *raft)
>      }
>
>      if (!raft->joining && time_msec() >= raft->election_timeout) {
> -        raft_start_election(raft, false);
> +        if (raft->role == RAFT_LEADER) {
> +            /* Check if majority of followers replied, then reset
> +             * election_timeout and reset s->replied. Otherwise, become
> +             * follower.
> +             *
> +             * Raft paper section 6.2: Leaders: A server might be in the leader
> +             * state, but if it isn’t the current leader, it could be
> +             * needlessly delaying client requests. For example, suppose a
> +             * leader is partitioned from the rest of the cluster, but it can
> +             * still communicate with a particular client. Without additional
> +             * mechanism, it could delay a request from that client forever,
> +             * being unable to replicate a log entry to any other servers.
> +             * Meanwhile, there might be another leader of a newer term that is
> +             * able to communicate with a majority of the cluster and would be
> +             * able to commit the client’s request. Thus, a leader in Raft
> +             * steps down if an election timeout elapses without a successful
> +             * round of heartbeats to a majority of its cluster; this allows
> +             * clients to retry their requests with another server.
> +             */
> +            int count = 0;
> +            HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
> +                if (server->replied) {
> +                    count ++;
> +                }
> +            }
> +            if (count >= hmap_count(&raft->servers) / 2) {
> +                HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
> +                    server->replied = false;
> +                }
> +                raft_reset_election_timer(raft);
> +            } else {
> +                raft_become_follower(raft);
> +                raft_start_election(raft, false);
> +            }
> +        } else {
> +            raft_start_election(raft, false);
> +        }
> +
>      }
>
>      if (raft->leaving && time_msec() >= raft->leave_timeout) {
> @@ -2454,6 +2490,7 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s)
>      s->next_index = raft->log_end;
>      s->match_index = 0;
>      s->phase = RAFT_PHASE_STABLE;
> +    s->replied = false;
>  }
>
>  static void
> @@ -2477,7 +2514,7 @@ raft_become_leader(struct raft *raft)
>      ovs_assert(raft->role != RAFT_LEADER);
>      raft->role = RAFT_LEADER;
>      raft_set_leader(raft, &raft->sid);
> -    raft->election_timeout = LLONG_MAX;
> +    raft_reset_election_timer(raft);
>      raft_reset_ping_timer(raft);
>
>      struct raft_server *s;
> @@ -3207,6 +3244,7 @@ raft_handle_append_reply(struct raft *raft,
>          }
>      }
>
> +    s->replied = true;
>      if (rpy->result == RAFT_APPEND_OK) {
>          /* Figure 3.1: "If successful, update nextIndex and matchIndex for
>           * follower (section 3.5)." */
> diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
> index 6a13843..7146fe6 100644
> --- a/tests/ovsdb-cluster.at
> +++ b/tests/ovsdb-cluster.at
> @@ -65,59 +65,82 @@ EXECUTION_EXAMPLES
>
>  AT_BANNER([OVSDB - disconnect from cluster])
>
> -# When a node is disconnected from the cluster, the IDL should disconnect
> -# and retry even if it uses a single remote, because the remote IP can be
> -# a VIP on a load-balance.
> -AT_SETUP([OVSDB cluster - disconnect from cluster, single remote])
> -AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
> +OVS_START_SHELL_HELPERS
> +# ovsdb_test_cluster_disconnect LEADER_OR_FOLLOWER
> +ovsdb_test_cluster_disconnect () {
> +    leader_or_follower=$1
> +    schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> +    ordinal_schema > schema
> +    AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
> +    cid=`ovsdb-tool db-cid s1.db`
> +    schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> +    for i in `seq 2 3`; do
> +        AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
> +    done
> +
> +    on_exit 'kill `cat *.pid`'
> +    for i in `seq 3`; do
> +        AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
> +    done
> +    for i in `seq 3`; do
> +        AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
> +    done
> +
> +    AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
> +      {"op": "insert",
> +       "table": "simple",
> +       "row": {"i": 1}}]]'], [0], [ignore], [ignore])
> +
> +    # When a node is disconnected from the cluster, the IDL should disconnect
> +    # and retry even if it uses a single remote, because the remote IP can be
> +    # a VIP on a load-balance. So we use single remote to test here.
> +    if test $leader_or_follower == "leader"; then
> +        target=1
> +        shutdown="2 3"
> +    else
> +        target=3
> +
> +        # shutdown follower before the leader so that there is no chance for s3
> +        # become leader during the process.
> +        shutdown="2 1"
> +    fi
> +
> +    # Connect to $target. Use "wait" to trigger a non-op transaction so
> +    # that test-ovsdb will not quit.
> +
> +    on_exit 'pkill test-ovsdb'
> +    test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s$target.ovsdb '[["idltest",
> +      {"op": "wait",
> +       "table": "simple",
> +       "where": [["i", "==", 1]],
> +       "columns": ["i"],
> +       "until": "==",
> +       "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 &
>
> -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> -ordinal_schema > schema
> -AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
> -cid=`ovsdb-tool db-cid s1.db`
> -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> -for i in `seq 2 3`; do
> -    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
> -done
> -
> -on_exit 'kill `cat *.pid`'
> -for i in `seq 3`; do
> -    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
> -done
> -for i in `seq 3`; do
> -    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
> -done
> -
> -AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
> -    {"op": "insert",
> -     "table": "simple",
> -     "row": {"i": 1}}]]'], [0], [ignore], [ignore])
> -
> -# Connect to a single remote s3. Use "wait" to trigger a non-op transaction so
> -# that test-ovsdb will not quit.
> -
> -on_exit 'pkill test-ovsdb'
> -test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s3.ovsdb '[["idltest",
> -    {"op": "wait",
> -     "table": "simple",
> -     "where": [["i", "==", 1]],
> -     "columns": ["i"],
> -     "until": "==",
> -     "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 &
> -
> -OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
> -
> -# Shutdown the other 2 servers so that s3 is disconnected from the cluster.
> -for i in 2 1; do
> -    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
> -done
> -
> -# The test-ovsdb should detect the disconnect and retry.
> -OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
> -
> -OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s3], [s3.pid])
> +    OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
>
> +    # Shutdown the other servers so that $target is disconnected from the cluster.
> +    for i in $shutdown; do
> +        OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
> +    done
> +
> +    # The test-ovsdb should detect the disconnect and retry.
> +    OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
> +
> +    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$target], [s$target.pid])
> +}
> +OVS_END_SHELL_HELPERS
> +
> +AT_SETUP([OVSDB cluster - follower disconnect from cluster, single remote])
> +AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
> +ovsdb_test_cluster_disconnect follower
> +AT_CLEANUP
> +
> +AT_SETUP([OVSDB cluster - leader disconnect from cluster, single remote])
> +AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
> +ovsdb_test_cluster_disconnect leader
>  AT_CLEANUP
> +
>
>
>  OVS_START_SHELL_HELPERS
> --
> 2.1.0
>
Sorry that I forgot to add:

Reported-by: Aliasgar Ginwala <[email protected]>
Tested-by: Aliasgar Ginwala <[email protected]>

I will wait for comments and add it to v2 (if v2 is needed).
