On 20.03.2019 8:56, Han Zhou wrote:
> From: Han Zhou <[email protected]>
> 
> When update is requested from follower, the leader sends AppendRequest
> to all followers and wait until AppendReply received from majority, and
> then it will update commit index - the new entry is regarded as committed
> in raft log. However, this commit will not be notified to followers
> (including the one initiated the request) until next heartbeat (ping
> timeout), if no other pending requests. This results in long latency
> for updates made through followers, especially when a batch of updates
> are requested through the same follower.
> 
> $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> 
> real    0m34.154s
> user    0m0.083s
> sys 0m0.250s
> 
> This patch solves the problem by sending heartbeat as soon as the commit
> index is updated in leader. It also avoids unnessary heartbeat by resetting
> the ping timer whenever AppendRequest is broadcasted. With this patch
> the performance is improved more than 50 times in same test:
> 
> $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> 
> real    0m0.564s
> user    0m0.080s
> sys 0m0.199s
> 
> Torture test cases are also updated because otherwise the tests will
> all be skipped because of the improved performance.
> 
> Signed-off-by: Han Zhou <[email protected]>
> ---
> 
> Notes:
>     v3->v4: Update torture tests again. Instead of sleeping, the size of
>     transaction of each client is increased to slow down the execution so 
> that the
>     chance of parallel executions are not reduced.
> 

Unfortunately, this patch fails all the testsuite runs on TravisCI:

  https://travis-ci.org/ovsrobot/ovs/builds/508777615

And some on CirrusCI too:

  https://cirrus-ci.com/build/5201766546145280

Best regards, Ilya Maximets.

>  ovsdb/raft.c           | 43 +++++++++++++++++++++++++++++--------------
>  tests/ovsdb-cluster.at | 16 +++++++++++-----
>  2 files changed, 40 insertions(+), 19 deletions(-)
> 
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index eee4f33..31e9e72 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -320,7 +320,8 @@ static void raft_send_append_request(struct raft *,
>  
>  static void raft_become_leader(struct raft *);
>  static void raft_become_follower(struct raft *);
> -static void raft_reset_timer(struct raft *);
> +static void raft_reset_election_timer(struct raft *);
> +static void raft_reset_ping_timer(struct raft *);
>  static void raft_send_heartbeats(struct raft *);
>  static void raft_start_election(struct raft *, bool leadership_transfer);
>  static bool raft_truncate(struct raft *, uint64_t new_end);
> @@ -376,8 +377,8 @@ raft_alloc(void)
>      hmap_init(&raft->add_servers);
>      hmap_init(&raft->commands);
>  
> -    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
> -    raft_reset_timer(raft);
> +    raft_reset_ping_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      return raft;
>  }
> @@ -865,7 +866,7 @@ raft_read_log(struct raft *raft)
>  }
>  
>  static void
> -raft_reset_timer(struct raft *raft)
> +raft_reset_election_timer(struct raft *raft)
>  {
>      unsigned int duration = (ELECTION_BASE_MSEC
>                               + random_range(ELECTION_RANGE_MSEC));
> @@ -874,6 +875,12 @@ raft_reset_timer(struct raft *raft)
>  }
>  
>  static void
> +raft_reset_ping_timer(struct raft *raft)
> +{
> +    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
> +}
> +
> +static void
>  raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
>                const struct uuid *sid, bool incoming)
>  {
> @@ -1603,7 +1610,7 @@ raft_start_election(struct raft *raft, bool 
> leadership_transfer)
>              VLOG_INFO("term %"PRIu64": starting election", raft->term);
>          }
>      }
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      struct raft_server *peer;
>      HMAP_FOR_EACH (peer, hmap_node, &raft->servers) {
> @@ -1782,8 +1789,8 @@ raft_run(struct raft *raft)
>                      raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
>                  }
>              }
> +            raft_reset_ping_timer(raft);
>          }
> -        raft->ping_timeout = time_msec() + PING_TIME_MSEC;
>      }
>  
>      /* Do this only at the end; if we did it as soon as we set raft->left or
> @@ -1963,6 +1970,7 @@ raft_command_initiate(struct raft *raft,
>              s->next_index++;
>          }
>      }
> +    raft_reset_ping_timer(raft);
>  
>      return cmd;
>  }
> @@ -2313,7 +2321,7 @@ raft_become_follower(struct raft *raft)
>      }
>  
>      raft->role = RAFT_FOLLOWER;
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      /* Notify clients about lost leadership.
>       *
> @@ -2387,6 +2395,8 @@ raft_send_heartbeats(struct raft *raft)
>                                              RAFT_CMD_INCOMPLETE, 0);
>          }
>      }
> +
> +    raft_reset_ping_timer(raft);
>  }
>  
>  /* Initializes the fields in 's' that represent the leader's view of the
> @@ -2413,7 +2423,7 @@ raft_become_leader(struct raft *raft)
>      raft->role = RAFT_LEADER;
>      raft->leader_sid = raft->sid;
>      raft->election_timeout = LLONG_MAX;
> -    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
> +    raft_reset_ping_timer(raft);
>  
>      struct raft_server *s;
>      HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
> @@ -2573,11 +2583,13 @@ raft_get_next_entry(struct raft *raft, struct uuid 
> *eid)
>      return data;
>  }
>  
> -static void
> +/* Updates commit index in raft log. If commit index is already up-to-date
> + * it does nothing and return false, otherwise, returns true. */
> +static bool
>  raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
>  {
>      if (new_commit_index <= raft->commit_index) {
> -        return;
> +        return false;
>      }
>  
>      if (raft->role == RAFT_LEADER) {
> @@ -2610,6 +2622,7 @@ raft_update_commit_index(struct raft *raft, uint64_t 
> new_commit_index)
>          .commit_index = raft->commit_index,
>      };
>      ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
> +    return true;
>  }
>  
>  /* This doesn't use rq->entries (but it does use rq->n_entries). */
> @@ -2809,7 +2822,7 @@ raft_handle_append_request(struct raft *raft,
>                                 "usurped leadership");
>          return;
>      }
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      /* First check for the common case, where the AppendEntries request is
>       * entirely for indexes covered by 'log_start' ... 'log_end - 1', 
> something
> @@ -3045,7 +3058,9 @@ raft_consider_updating_commit_index(struct raft *raft)
>              }
>          }
>      }
> -    raft_update_commit_index(raft, new_commit_index);
> +    if (raft_update_commit_index(raft, new_commit_index)) {
> +        raft_send_heartbeats(raft);
> +    }
>  }
>  
>  static void
> @@ -3274,7 +3289,7 @@ raft_handle_vote_request__(struct raft *raft,
>          return false;
>      }
>  
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      return true;
>  }
> @@ -3697,7 +3712,7 @@ static bool
>  raft_handle_install_snapshot_request__(
>      struct raft *raft, const struct raft_install_snapshot_request *rq)
>  {
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      /*
>       * Our behavior here depend on new_log_start in the snapshot compared to
> diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
> index c7f1e34..5550a19 100644
> --- a/tests/ovsdb-cluster.at
> +++ b/tests/ovsdb-cluster.at
> @@ -131,12 +131,16 @@ ovsdb|WARN|schema: changed 2 columns in 
> 'OVN_Southbound' database from ephemeral
>      done
>      export OVN_SB_DB
>  
> -    n1=10 n2=5
> +    n1=10 n2=5 n3=50
>      echo "starting $n1*$n2 ovn-sbctl processes..."
>      for i in $(seq 0 $(expr $n1 - 1) ); do
>          (for j in $(seq $n2); do
>               : > $i-$j.running
> -             run_as "ovn-sbctl($i-$j)" ovn-sbctl 
> "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" 
> --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 
> --no-leader-only add SB_Global . external_ids $i-$j=$i-$j
> +             txn="add SB_Global . external_ids $i-$j=$i-$j"
> +             for k in $(seq $n3); do
> +                 txn="$txn -- add SB_Global . external_ids $i-$j-$k=$i-$j-$k"
> +             done
> +             run_as "ovn-sbctl($i-$j)" ovn-sbctl 
> "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" 
> --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 
> --no-leader-only $txn
>               status=$?
>               if test $status != 0; then
>                   echo "$i-$j exited with status $status" > $i-$j:$status
> @@ -146,13 +150,12 @@ ovsdb|WARN|schema: changed 2 columns in 
> 'OVN_Southbound' database from ephemeral
>           : > $i.done)&
>      done
>      echo "...done"
> -    sleep 2
>  
>      echo "waiting for ovn-sbctl processes to exit..."
>      # Use file instead of var because code inside "while" runs in a subshell.
>      echo 0 > phase
>      i=0
> -    (while :; do echo; sleep 1; done) | while read REPLY; do
> +    (while :; do echo; sleep 0.1; done) | while read REPLY; do
>          printf "t=%2d s:" $i
>          done=0
>          for j in $(seq 0 $(expr $n1 - 1)); do
> @@ -168,7 +171,7 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' 
> database from ephemeral
>  
>          case $(cat phase) in # (
>          0)
> -            if test $done -ge $(expr $n1 / 4); then
> +            if test $done -ge $(expr $n1 / 10); then
>                  if test $variant = kill; then
>                      stop_server $victim
>                  else
> @@ -199,6 +202,9 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' 
> database from ephemeral
>      for i in $(seq 0 $(expr $n1 - 1) ); do
>          for j in `seq $n2`; do
>              echo "$i-$j=$i-$j"
> +            for k in `seq $n3`; do
> +                echo "$i-$j-$k=$i-$j-$k"
> +            done
>          done
>      done | sort > expout
>      AT_CHECK([ovn-sbctl --timeout=30 --log-file=finalize.log -vtimeval:off 
> -vfile -vsyslog:off --bare get SB_Global . external-ids | tr ',' '\n' | sed 
> 's/[[{}"" ]]//g' | sort], [0], [expout])
> 
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to