On 20.03.2019 8:56, Han Zhou wrote: > From: Han Zhou <[email protected]> > > When update is requested from follower, the leader sends AppendRequest > to all followers and wait until AppendReply received from majority, and > then it will update commit index - the new entry is regarded as committed > in raft log. However, this commit will not be notified to followers > (including the one initiated the request) until next heartbeat (ping > timeout), if no other pending requests. This results in long latency > for updates made through followers, especially when a batch of updates > are requested through the same follower. > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done > > real 0m34.154s > user 0m0.083s > sys 0m0.250s > > This patch solves the problem by sending heartbeat as soon as the commit > index is updated in leader. It also avoids unnessary heartbeat by resetting > the ping timer whenever AppendRequest is broadcasted. With this patch > the performance is improved more than 50 times in same test: > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done > > real 0m0.564s > user 0m0.080s > sys 0m0.199s > > Torture test cases are also updated because otherwise the tests will > all be skipped because of the improved performance. > > Signed-off-by: Han Zhou <[email protected]> > --- > > Notes: > v3->v4: Update torture tests again. Instead of sleeping, the size of > transaction of each client is increased to slow down the execution so > that the > chance of parallel executions are not reduced. >
Unfortunately, this patch fails all the testsuite runs on TravisCI: https://travis-ci.org/ovsrobot/ovs/builds/508777615 And some on CirrusCI too: https://cirrus-ci.com/build/5201766546145280 Best regards, Ilya Maximets. > ovsdb/raft.c | 43 +++++++++++++++++++++++++++++-------------- > tests/ovsdb-cluster.at | 16 +++++++++++----- > 2 files changed, 40 insertions(+), 19 deletions(-) > > diff --git a/ovsdb/raft.c b/ovsdb/raft.c > index eee4f33..31e9e72 100644 > --- a/ovsdb/raft.c > +++ b/ovsdb/raft.c > @@ -320,7 +320,8 @@ static void raft_send_append_request(struct raft *, > > static void raft_become_leader(struct raft *); > static void raft_become_follower(struct raft *); > -static void raft_reset_timer(struct raft *); > +static void raft_reset_election_timer(struct raft *); > +static void raft_reset_ping_timer(struct raft *); > static void raft_send_heartbeats(struct raft *); > static void raft_start_election(struct raft *, bool leadership_transfer); > static bool raft_truncate(struct raft *, uint64_t new_end); > @@ -376,8 +377,8 @@ raft_alloc(void) > hmap_init(&raft->add_servers); > hmap_init(&raft->commands); > > - raft->ping_timeout = time_msec() + PING_TIME_MSEC; > - raft_reset_timer(raft); > + raft_reset_ping_timer(raft); > + raft_reset_election_timer(raft); > > return raft; > } > @@ -865,7 +866,7 @@ raft_read_log(struct raft *raft) > } > > static void > -raft_reset_timer(struct raft *raft) > +raft_reset_election_timer(struct raft *raft) > { > unsigned int duration = (ELECTION_BASE_MSEC > + random_range(ELECTION_RANGE_MSEC)); > @@ -874,6 +875,12 @@ raft_reset_timer(struct raft *raft) > } > > static void > +raft_reset_ping_timer(struct raft *raft) > +{ > + raft->ping_timeout = time_msec() + PING_TIME_MSEC; > +} > + > +static void > raft_add_conn(struct raft *raft, struct jsonrpc_session *js, > const struct uuid *sid, bool incoming) > { > @@ -1603,7 +1610,7 @@ raft_start_election(struct raft *raft, bool > leadership_transfer) > VLOG_INFO("term %"PRIu64": starting election", raft->term); > } > } > - raft_reset_timer(raft); > + raft_reset_election_timer(raft); > > struct raft_server *peer; > HMAP_FOR_EACH (peer, hmap_node, &raft->servers) { > @@ -1782,8 +1789,8 @@ raft_run(struct raft *raft) > raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT); > } > } > + raft_reset_ping_timer(raft); > } > - raft->ping_timeout = time_msec() + PING_TIME_MSEC; > } > > /* Do this only at the end; if we did it as soon as we set raft->left or > @@ -1963,6 +1970,7 @@ raft_command_initiate(struct raft *raft, > s->next_index++; > } > } > + raft_reset_ping_timer(raft); > > return cmd; > } > @@ -2313,7 +2321,7 @@ raft_become_follower(struct raft *raft) > } > > raft->role = RAFT_FOLLOWER; > - raft_reset_timer(raft); > + raft_reset_election_timer(raft); > > /* Notify clients about lost leadership. > * > @@ -2387,6 +2395,8 @@ raft_send_heartbeats(struct raft *raft) > RAFT_CMD_INCOMPLETE, 0); > } > } > + > + raft_reset_ping_timer(raft); > } > > /* Initializes the fields in 's' that represent the leader's view of the > @@ -2413,7 +2423,7 @@ raft_become_leader(struct raft *raft) > raft->role = RAFT_LEADER; > raft->leader_sid = raft->sid; > raft->election_timeout = LLONG_MAX; > - raft->ping_timeout = time_msec() + PING_TIME_MSEC; > + raft_reset_ping_timer(raft); > > struct raft_server *s; > HMAP_FOR_EACH (s, hmap_node, &raft->servers) { > @@ -2573,11 +2583,13 @@ raft_get_next_entry(struct raft *raft, struct uuid > *eid) > return data; > } > > -static void > +/* Updates commit index in raft log. If commit index is already up-to-date > + * it does nothing and return false, otherwise, returns true. */ > +static bool > raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) > { > if (new_commit_index <= raft->commit_index) { > - return; > + return false; > } > > if (raft->role == RAFT_LEADER) { > @@ -2610,6 +2622,7 @@ raft_update_commit_index(struct raft *raft, uint64_t > new_commit_index) > .commit_index = raft->commit_index, > }; > ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); > + return true; > } > > /* This doesn't use rq->entries (but it does use rq->n_entries). */ > @@ -2809,7 +2822,7 @@ raft_handle_append_request(struct raft *raft, > "usurped leadership"); > return; > } > - raft_reset_timer(raft); > + raft_reset_election_timer(raft); > > /* First check for the common case, where the AppendEntries request is > * entirely for indexes covered by 'log_start' ... 'log_end - 1', > something > @@ -3045,7 +3058,9 @@ raft_consider_updating_commit_index(struct raft *raft) > } > } > } > - raft_update_commit_index(raft, new_commit_index); > + if (raft_update_commit_index(raft, new_commit_index)) { > + raft_send_heartbeats(raft); > + } > } > > static void > @@ -3274,7 +3289,7 @@ raft_handle_vote_request__(struct raft *raft, > return false; > } > > - raft_reset_timer(raft); > + raft_reset_election_timer(raft); > > return true; > } > @@ -3697,7 +3712,7 @@ static bool > raft_handle_install_snapshot_request__( > struct raft *raft, const struct raft_install_snapshot_request *rq) > { > - raft_reset_timer(raft); > + raft_reset_election_timer(raft); > > /* > * Our behavior here depend on new_log_start in the snapshot compared to > diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at > index c7f1e34..5550a19 100644 > --- a/tests/ovsdb-cluster.at > +++ b/tests/ovsdb-cluster.at > @@ -131,12 +131,16 @@ ovsdb|WARN|schema: changed 2 columns in > 'OVN_Southbound' database from ephemeral > done > export OVN_SB_DB > > - n1=10 n2=5 > + n1=10 n2=5 n3=50 > echo "starting $n1*$n2 ovn-sbctl processes..." > for i in $(seq 0 $(expr $n1 - 1) ); do > (for j in $(seq $n2); do > : > $i-$j.running > - run_as "ovn-sbctl($i-$j)" ovn-sbctl > "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" > --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 > --no-leader-only add SB_Global . external_ids $i-$j=$i-$j > + txn="add SB_Global . external_ids $i-$j=$i-$j" > + for k in $(seq $n3); do > + txn="$txn -- add SB_Global . external_ids $i-$j-$k=$i-$j-$k" > + done > + run_as "ovn-sbctl($i-$j)" ovn-sbctl > "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" > --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 > --no-leader-only $txn > status=$? > if test $status != 0; then > echo "$i-$j exited with status $status" > $i-$j:$status > @@ -146,13 +150,12 @@ ovsdb|WARN|schema: changed 2 columns in > 'OVN_Southbound' database from ephemeral > : > $i.done)& > done > echo "...done" > - sleep 2 > > echo "waiting for ovn-sbctl processes to exit..." > # Use file instead of var because code inside "while" runs in a subshell. > echo 0 > phase > i=0 > - (while :; do echo; sleep 1; done) | while read REPLY; do > + (while :; do echo; sleep 0.1; done) | while read REPLY; do > printf "t=%2d s:" $i > done=0 > for j in $(seq 0 $(expr $n1 - 1)); do > @@ -168,7 +171,7 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' > database from ephemeral > > case $(cat phase) in # ( > 0) > - if test $done -ge $(expr $n1 / 4); then > + if test $done -ge $(expr $n1 / 10); then > if test $variant = kill; then > stop_server $victim > else > @@ -199,6 +202,9 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' > database from ephemeral > for i in $(seq 0 $(expr $n1 - 1) ); do > for j in `seq $n2`; do > echo "$i-$j=$i-$j" > + for k in `seq $n3`; do > + echo "$i-$j-$k=$i-$j-$k" > + done > done > done | sort > expout > AT_CHECK([ovn-sbctl --timeout=30 --log-file=finalize.log -vtimeval:off > -vfile -vsyslog:off --bare get SB_Global . external-ids | tr ',' '\n' | sed > 's/[[{}"" ]]//g' | sort], [0], [expout]) > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
