Hello community, here is the log from the commit of package raft for openSUSE:Factory checked in at 2020-06-03 20:34:55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/raft (Old) and /work/SRC/openSUSE:Factory/.raft.new.3606 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "raft" Wed Jun 3 20:34:55 2020 rev:8 rq:811046 version:0.9.22 Changes: -------- --- /work/SRC/openSUSE:Factory/raft/raft.changes 2020-05-13 22:56:04.563008275 +0200 +++ /work/SRC/openSUSE:Factory/.raft.new.3606/raft.changes 2020-06-03 20:35:46.485784255 +0200 @@ -1,0 +2,6 @@ +Wed Jun 3 08:46:43 UTC 2020 - Andreas Stieger <[email protected]> + +- raft 0.9.22: + * various raft protocol fixes + +------------------------------------------------------------------- Old: ---- raft-0.9.19.tar.gz New: ---- raft-0.9.22.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ raft.spec ++++++ --- /var/tmp/diff_new_pack.3Yfdeg/_old 2020-06-03 20:35:48.113789358 +0200 +++ /var/tmp/diff_new_pack.3Yfdeg/_new 2020-06-03 20:35:48.117789370 +0200 @@ -18,7 +18,7 @@ %bcond_without libuv Name: raft -Version: 0.9.19 +Version: 0.9.22 Release: 0 Summary: Fully asynchronous C implementation of the Raft consensus protocol License: LGPL-3.0-only WITH linking-exception-lgpl-3.0 ++++++ raft-0.9.19.tar.gz -> raft-0.9.22.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/.travis.yml new/raft-0.9.22/.travis.yml --- old/raft-0.9.19/.travis.yml 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/.travis.yml 2020-05-29 11:33:07.000000000 +0200 @@ -1,4 +1,8 @@ language: c +env: + global: + - secure: 
"T0kfZEvvymfcV4LWvDxafXMYIXlvhgYBWpY8GMKFxx8NBPBZID7wLFEnhiAF/p7gSaW4491mn3aDGA6/l9mJ/2MgXJssBiGh/5Wz+9dUTKH8T176PZI0+hNSkLPI+c/L00sslhhL7ZOF41mQf4VVHUhkIlFtfI9s/5e6iClw3EefSryX4lqmmE7E/aSMuE5fAQYNV48iwwQvZvcWEZ4yg8Cie5nMv6MdFQUZhmnxXZeixQ2JJGx2/nT/AhlvxTNlPXMsS9khWYVCts4DUpdO3qv52Zj1hOB6f7QoqY7qv3RsHFTN+6YsNlgbKCmFV4JDkCZqklQSfiyFB4IqqudH2AGFNhaJw00wb6kk7Kz3l827V36ib7Jgt3jWg+iF6elTrPm/1Friu7VdW2aCxLWtQYVgkW6sl3uWi8W1Uv8nM58vgFzmtJuojts0mfy0Q2sT/2gR/OITIIcfjZZ8X25Dtm9uNh/7wCVAyU19thiTNVQWrykzEZoiJXWkp1TUZQhpT/PpU5ibwumuk2ZgqAzUSYXzxWXk1/qqOkks8bz4LuKqX9uHbePCUHhvJA3DgfR1kajGREjycteoRNvZQyt8l0hP0deCdEVNZe0GY0Ut0dN6EnjuGMszqioF4ozz9Pje4OR/4u8+H1rVFY4VVliVLIawZ+Eusu4rZgl9oSQ8CzY=" + addons: apt: packages: @@ -8,33 +12,54 @@ - btrfs-progs - xfsprogs - zfsutils-linux + + coverity_scan: + build_script_url: https://dl.stgraber.org/coverity_travis.sh + project: + name: canonical/raft + description: "Fully asynchronous C implementation of the Raft consensus protocol" + + # Where email notification of build analysis results will be sent + notification_email: [email protected] + + build_command_prepend: "autoreconf -i && ./configure" + build_command: "make" + branch_pattern: master + jobs: include: - compiler: gcc dist: bionic arch: amd64 + - compiler: gcc dist: xenial arch: amd64 + - if: type != pull_request compiler: clang dist: bionic arch: amd64 + - if: type != pull_request compiler: gcc dist: bionic arch: s390x + - if: type != pull_request compiler: gcc dist: bionic arch: arm64 + - if: type != pull_request compiler: clang dist: bionic arch: ppc64le + before_script: - git clone --depth 1 https://github.com/edlund/amalgamate.git - export PATH=$PATH:$PWD/amalgamate + script: - autoreconf -i - ./configure --enable-example --enable-debug --enable-code-coverage --enable-sanitize @@ -44,5 +69,6 @@ - make check CFLAGS=-O0 $(./test/lib/fs.sh detect) || (cat ./test-suite.log && false) - if [ $TRAVIS_COMPILER = gcc ]; then make code-coverage-capture; fi - ./test/lib/fs.sh teardown 
+ after_success: - bash <(curl -s https://codecov.io/bash) -G "./src*" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/README.md new/raft-0.9.22/README.md --- old/raft-0.9.19/README.md 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/README.md 2020-05-29 11:33:07.000000000 +0200 @@ -41,6 +41,7 @@ - Writing to leader's disk in parallel - Automatic stepping down when the leader loses quorum - Leadership transfer extension +- Pre-vote protocol Building -------- diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/configure.ac new/raft-0.9.22/configure.ac --- old/raft-0.9.19/configure.ac 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/configure.ac 2020-05-29 11:33:07.000000000 +0200 @@ -1,5 +1,5 @@ AC_PREREQ(2.60) -AC_INIT([raft], [0.9.19]) +AC_INIT([raft], [0.9.22]) AC_LANG([C]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([ac]) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/docs/disk-format.rst new/raft-0.9.22/docs/disk-format.rst --- old/raft-0.9.19/docs/disk-format.rst 1970-01-01 01:00:00.000000000 +0100 +++ new/raft-0.9.22/docs/disk-format.rst 2020-05-29 11:33:07.000000000 +0200 @@ -0,0 +1,65 @@ +Disk format +=========== + +The implementation of metadata and log persistency is virtually the same as the +one found in `LogCabin`_. + +The disk files consist of metadata files, closed segments, and open +segments. Metadata files are used to track Raft metadata, such as the server's +current term, vote, and log's start index. Segments contain contiguous entries +that are part of the log. Closed segments are never written to again (but may be +renamed and truncated if a suffix of the log is truncated). Open segments are +where newly appended entries go. Once an open segment reaches the maximum +allowed size, it is closed and a new one is used. 
There are usually about 3 open +segments at any given time, the one with the lower index is the one actively +being written, and the other ones have been fallocate'd and are ready to be used +as soon as the active one gets closed. + +Metadata files are named "metadata1" and "metadata2". The code alternates +between these so that there is always at least one readable metadata file. + +On startup, the readable metadata file with the higher version number is used. + +The format of a metadata file is: + +* [8 bytes] Format (currently 1). +* [8 bytes] Incremental version number. +* [8 bytes] Current term. +* [8 bytes] ID of server we voted for. + +All values are in little endian encoding. + +Closed segments are named by the format string "%lu-%lu" with their start and +end indexes, both inclusive. Closed segments always contain at least one entry, +and the end index is always at least as large as the start index. Closed segment +files may occasionally include data past their filename's end index (these are +ignored but a warning is logged). This can happen if the suffix of the segment +is truncated and a crash occurs at an inopportune time (the segment file is +first renamed, then truncated, and a crash occurs in between). + +Open segments are named by the format string "open-%lu" with a unique +number. These should not exist when the server shuts down cleanly, but they +exist while the server is running and may be left around during a crash. Open +segments either contain entries which come after the last closed segment or are +full of zeros. When the server crashes while appending to an open segment, the +end of that file may be corrupt. We can't distinguish between a corrupt file and +a partially written entry. The code assumes it's a partially written entry, logs +a warning, and ignores it. + +Truncating a suffix of the log will remove all entries that are no longer part +of the log. 
Truncating a prefix of the log will only remove complete segments +that are before the new log start index. For example, if a segment has entries +10 through 20 and the prefix of the log is truncated to start at entry 15, that +entire segment will be retained. + +Each segment file starts with a segment header, which currently contains just an +8-byte version number for the format of that segment. The current format +(version 1) is just a concatenation of serialized entry batches. + +Each batch has the following format: + +* [4 bytes] CRC32 checksum of the batch header, little endian. +* [4 bytes] CRC32 checksum of the batch data, little endian. +* [ ... ] Batch of one or more entries. + +.. _LogCabin: https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/docs/index.rst new/raft-0.9.22/docs/index.rst --- old/raft-0.9.19/docs/index.rst 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/docs/index.rst 2020-05-29 11:33:07.000000000 +0200 @@ -130,3 +130,4 @@ self getting-started api + disk-format diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/example/server.c new/raft-0.9.22/example/server.c --- old/raft-0.9.19/example/server.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/example/server.c 2020-05-29 11:33:07.000000000 +0200 @@ -234,6 +234,7 @@ raft_set_snapshot_threshold(&s->raft, 64); raft_set_snapshot_trailing(&s->raft, 16); + raft_set_pre_vote(&s->raft, true); s->transfer.data = s; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/include/raft.h new/raft-0.9.22/include/raft.h --- old/raft-0.9.19/include/raft.h 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/include/raft.h 2020-05-29 11:33:07.000000000 +0200 @@ -244,6 +244,7 @@ raft_index last_log_index; /* Index of candidate's last log entry. 
*/ raft_index last_log_term; /* Term of log entry at last_log_index. */ bool disrupt_leader; /* True if current leader should be discarded. */ + bool pre_vote; /* True if this is a pre-vote request. */ }; /** @@ -640,6 +641,8 @@ { unsigned randomized_election_timeout; /* Timer expiration. */ bool *votes; /* Vote results. */ + bool disrupt_leader; /* For leadership transfer */ + bool in_pre_vote; /* True in pre-vote phase. */ } candidate_state; struct { @@ -687,6 +690,10 @@ * error occurred. */ char errmsg[RAFT_ERRMSG_BUF_SIZE]; + + /* Whether to use pre-vote to avoid disconnected servers disrupting the + * current leader, as described in 4.2.3 and 9.6. */ + bool pre_vote; }; RAFT_API int raft_init(struct raft *r, @@ -762,6 +769,11 @@ RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n); /** + * Enable or disable pre-vote support. Pre-vote is turned off by default. + */ +RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled); + +/** * Number of outstanding log entries to keep in the log after a snapshot has * been taken. This avoids sending snapshots when a follower is behind by just a * few entries. The default is 128. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/convert.c new/raft-0.9.22/src/convert.c --- old/raft-0.9.19/src/convert.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/convert.c 2020-05-29 11:33:07.000000000 +0200 @@ -154,6 +154,8 @@ if (r->candidate_state.votes == NULL) { return RAFT_NOMEM; } + r->candidate_state.disrupt_leader = disrupt_leader; + r->candidate_state.in_pre_vote = r->pre_vote; /* Fast-forward to leader if we're the only voting server in the * configuration. 
*/ @@ -167,7 +169,7 @@ } /* Start a new election round */ - rv = electionStart(r, disrupt_leader); + rv = electionStart(r); if (rv != 0) { r->state = RAFT_FOLLOWER; raft_free(r->candidate_state.votes); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/election.c new/raft-0.9.22/src/election.c --- old/raft-0.9.19/src/election.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/election.c 2020-05-29 11:33:07.000000000 +0200 @@ -60,22 +60,29 @@ } /* Send a RequestVote RPC to the given server. */ -static int electionSend(struct raft *r, - const struct raft_server *server, - bool disrupt_leader) +static int electionSend(struct raft *r, const struct raft_server *server) { struct raft_message message; struct raft_io_send *send; + raft_term term; int rv; assert(server->id != r->id); assert(server->id != 0); + /* If we are in the pre-vote phase, we indicate our future term in the + * request. */ + term = r->current_term; + if (r->candidate_state.in_pre_vote) { + term++; + } + message.type = RAFT_IO_REQUEST_VOTE; - message.request_vote.term = r->current_term; + message.request_vote.term = term; message.request_vote.candidate_id = r->id; message.request_vote.last_log_index = logLastIndex(&r->log); message.request_vote.last_log_term = logLastTerm(&r->log); - message.request_vote.disrupt_leader = disrupt_leader; + message.request_vote.disrupt_leader = r->candidate_state.disrupt_leader; + message.request_vote.pre_vote = r->candidate_state.in_pre_vote; message.server_id = server->id; message.server_address = server->address; @@ -95,7 +102,7 @@ return 0; } -int electionStart(struct raft *r, bool disrupt_leader) +int electionStart(struct raft *r) { raft_term term; size_t n_voters; @@ -117,22 +124,25 @@ assert(n_voters <= r->configuration.n); assert(voting_index < n_voters); - /* Increment current term */ - term = r->current_term + 1; - rv = r->io->set_term(r->io, term); - if (rv != 0) { - goto err; - } + /* During 
pre-vote we don't actually increment term or persist vote. */ + if (!r->candidate_state.in_pre_vote) { + /* Increment current term */ + term = r->current_term + 1; + rv = r->io->set_term(r->io, term); + if (rv != 0) { + goto err; + } - /* Vote for self */ - rv = r->io->set_vote(r->io, r->id); - if (rv != 0) { - goto err; - } + /* Vote for self */ + rv = r->io->set_vote(r->io, r->id); + if (rv != 0) { + goto err; + } - /* Update our cache too. */ - r->current_term = term; - r->voted_for = r->id; + /* Update our cache too. */ + r->current_term = term; + r->voted_for = r->id; + } /* Reset election timer. */ electionResetTimer(r); @@ -152,10 +162,10 @@ if (server->id == r->id || server->role != RAFT_VOTER) { continue; } - rv = electionSend(r, server, disrupt_leader); + rv = electionSend(r, server); if (rv != 0) { /* This is not a critical failure, let's just log it. */ - tracef("failed to send vote request to server %u: %s", server->id, + tracef("failed to send vote request to server %llu: %s", server->id, raft_strerror(rv)); } } @@ -174,6 +184,7 @@ const struct raft_server *local_server; raft_index local_last_index; raft_term local_last_term; + bool is_transferee; /* Requester is the target of a leadership transfer */ int rv; assert(r != NULL); @@ -189,7 +200,10 @@ return 0; } - if (r->voted_for != 0 && r->voted_for != args->candidate_id) { + is_transferee = + r->transfer != NULL && r->transfer->id == args->candidate_id; + if (r->voted_for != 0 && r->voted_for != args->candidate_id && + !is_transferee) { tracef("local server already voted -> not granting vote"); return 0; } @@ -237,16 +251,18 @@ return 0; grant_vote: - rv = r->io->set_vote(r->io, args->candidate_id); - if (rv != 0) { - return rv; + if (!args->pre_vote) { + rv = r->io->set_vote(r->io, args->candidate_id); + if (rv != 0) { + return rv; + } + r->voted_for = args->candidate_id; + + /* Reset the election timer. 
*/ + r->election_timer_start = r->io->time(r->io); } *granted = true; - r->voted_for = args->candidate_id; - - /* Reset the election timer. */ - r->election_timer_start = r->io->time(r->io); return 0; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/election.h new/raft-0.9.22/src/election.h --- old/raft-0.9.19/src/election.h 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/election.h 2020-05-29 11:33:07.000000000 +0200 @@ -55,10 +55,8 @@ * transitions to candidate state. It then votes for itself and issues * RequestVote RPCs in parallel to each of the other servers in the * cluster. - * - * If the disrupt_leader flag is true, the server will set the disrupt leader - * flag of the RequestVote messages it sends. */ -int electionStart(struct raft *r, bool disrupt_leader); + */ +int electionStart(struct raft *r); /* Decide whether our vote should be granted to the requesting server and update * our state accordingly. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/raft.c new/raft-0.9.22/src/raft.c --- old/raft-0.9.19/src/raft.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/raft.c 2020-05-29 11:33:07.000000000 +0200 @@ -57,6 +57,7 @@ r->snapshot.put.data = NULL; r->close_cb = NULL; memset(r->errmsg, 0, sizeof r->errmsg); + r->pre_vote = false; rv = r->io->init(r->io, r->id, r->address); if (rv != 0) { ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); @@ -112,6 +113,11 @@ r->snapshot.trailing = n; } +void raft_set_pre_vote(struct raft *r, bool enabled) +{ + r->pre_vote = enabled; +} + const char *raft_errmsg(struct raft *r) { return r->errmsg; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/recv.c new/raft-0.9.22/src/recv.c --- old/raft-0.9.19/src/recv.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/recv.c 2020-05-29 11:33:07.000000000 +0200 @@ -111,14 
+111,20 @@ } } -/* Bump the current term to the given value and reset our vote, persiting the - * change to disk. */ -static int bumpCurrentTerm(struct raft *r, raft_term term) +int recvBumpCurrentTerm(struct raft *r, raft_term term) { int rv; + char msg[128]; assert(r != NULL); - assert(term >= r->current_term); + assert(term > r->current_term); + + sprintf(msg, "remote term %lld is higher than %lld -> bump local term", + term, r->current_term); + if (r->state != RAFT_FOLLOWER) { + strcat(msg, " and step down"); + } + tracef("%s", msg); /* Save the new term to persistent store, resetting the vote. */ rv = r->io->set_term(r->io, term); @@ -130,9 +136,25 @@ r->current_term = term; r->voted_for = 0; + if (r->state != RAFT_FOLLOWER) { + /* Also convert to follower. */ + convertToFollower(r); + } + return 0; } +void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match) +{ + if (term < r->current_term) { + *match = -1; + } else if (term > r->current_term) { + *match = 1; + } else { + *match = 0; + } +} + int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match) { int rv; @@ -140,8 +162,9 @@ assert(r != NULL); assert(match != NULL); - if (term < r->current_term) { - *match = -1; + recvCheckMatchingTerms(r, term, match); + + if (*match == -1) { return 0; } @@ -159,25 +182,11 @@ * If a candidate or leader discovers that its term is out of date, it * immediately reverts to follower state. */ - if (term > r->current_term) { - char msg[128]; - sprintf(msg, "remote term %lld is higher than %lld -> bump local term", - term, r->current_term); - if (r->state != RAFT_FOLLOWER) { - strcat(msg, " and step down"); - } - tracef("%s", msg); - rv = bumpCurrentTerm(r, term); + if (*match == 1) { + rv = recvBumpCurrentTerm(r, term); if (rv != 0) { return rv; } - if (r->state != RAFT_FOLLOWER) { - /* Also convert to follower. 
*/ - convertToFollower(r); - } - *match = 1; - } else { - *match = 0; } return 0; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/recv.h new/raft-0.9.22/src/recv.h --- old/raft-0.9.19/src/recv.h 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/recv.h 2020-05-29 11:33:07.000000000 +0200 @@ -9,6 +9,17 @@ * receiving an RPC message. */ void recvCb(struct raft_io *io, struct raft_message *message); +/* Compare a request's term with the server's current term. + * + * The match output parameter will be set to 0 if the local term matches the + * request's term, to -1 if the request's term is lower, and to 1 if the + * request's term is higher. */ +void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match); + +/* Bump the current term and possibly step down from candidate or leader + * state. */ +int recvBumpCurrentTerm(struct raft *r, raft_term term); + /* Common logic for RPC handlers, comparing the request's term with the server's * current term and possibly deciding to reject the request or step down from * candidate or leader. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/recv_request_vote.c new/raft-0.9.22/src/recv_request_vote.c --- old/raft-0.9.19/src/recv_request_vote.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/recv_request_vote.c 2020-05-29 11:33:07.000000000 +0200 @@ -26,6 +26,7 @@ struct raft_io_send *req; struct raft_message message; struct raft_request_vote_result *result = &message.request_vote_result; + bool has_leader; int match; int rv; @@ -47,22 +48,30 @@ * From Section 4.2.3: * * This change conflicts with the leadership transfer mechanism as - * described in Chapter 3, in whicha server legitimately starts an + * described in Chapter 3, in which a server legitimately starts an * election without waiting an election timeout. 
In that case, RequestVote * messages should be processed by other servers even when they believe a - * current cluster leader exists.Those RequestVote requests can include a + * current cluster leader exists. Those RequestVote requests can include a * special flag to indicate this behavior ("I have permission to disrupt * the leader - it told me to!"). */ - if (r->state == RAFT_FOLLOWER && r->follower_state.current_leader.id != 0 && - !args->disrupt_leader) { + has_leader = + r->state == RAFT_LEADER || + (r->state == RAFT_FOLLOWER && r->follower_state.current_leader.id != 0); + if (has_leader && !args->disrupt_leader) { tracef("local server has a leader -> reject "); goto reply; } - rv = recvEnsureMatchingTerms(r, args->term, &match); - if (rv != 0) { - return rv; + /* If this is a pre-vote request, don't actually increment our term or + * persist the vote. */ + if (args->pre_vote) { + recvCheckMatchingTerms(r, args->term, &match); + } else { + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } } /* From Figure 3.1: @@ -76,9 +85,12 @@ goto reply; } - /* At this point our term must be the same as the request term (otherwise we - * would have rejected the request or bumped our term). */ - assert(r->current_term == args->term); + /* Unless this is a pre-vote request, at this point our term must be the + * same as the request term (otherwise we would have rejected the request or + * bumped our term). 
*/ + if (!args->pre_vote) { + assert(r->current_term == args->term); + } rv = electionVote(r, args, &result->vote_granted); if (rv != 0) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/recv_request_vote_result.c new/raft-0.9.22/src/recv_request_vote_result.c --- old/raft-0.9.19/src/recv_request_vote_result.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/recv_request_vote_result.c 2020-05-29 11:33:07.000000000 +0200 @@ -41,9 +41,17 @@ return 0; } - rv = recvEnsureMatchingTerms(r, result->term, &match); - if (rv != 0) { - return rv; + /* If we're in the pre-vote phase, don't actually increment our term right + * now (we'll do it later, if we start the second phase), and also don't + * step down if the peer is just one term ahead (this is okay as in the + * request we sent our current term plus one). */ + if (r->candidate_state.in_pre_vote) { + recvCheckMatchingTerms(r, result->term, &match); + } else { + rv = recvEnsureMatchingTerms(r, result->term, &match); + if (rv != 0) { + return rv; + } } if (match < 0) { @@ -54,7 +62,23 @@ return 0; } - assert(result->term == r->current_term); + /* If we're in the pre-vote phase, check that the peer is at most one term + * ahead (possibly stepping down). If we're in the actual voting phase, we + * expect our term to be the same as the response term (otherwise we + * would have either ignored the result or bumped our term). */ + if (r->candidate_state.in_pre_vote) { + if (match > 0) { + if (result->term > r->current_term + 1) { + assert(!result->vote_granted); + rv = recvBumpCurrentTerm(r, result->term); + if (rv != 0) { + return rv; + } + } + } + } else { + assert(result->term == r->current_term); + } /* If the vote was granted and we reached quorum, convert to leader. 
* @@ -76,13 +100,22 @@ */ if (result->vote_granted) { if (electionTally(r, votes_index)) { - tracef("votes quorum reached -> convert to leader"); - rv = convertToLeader(r); - if (rv != 0) { - return rv; + if (r->candidate_state.in_pre_vote) { + tracef("votes quorum reached -> pre-vote successful"); + r->candidate_state.in_pre_vote = false; + rv = electionStart(r); + if (rv != 0) { + return rv; + } + } else { + tracef("votes quorum reached -> convert to leader"); + rv = convertToLeader(r); + if (rv != 0) { + return rv; + } + /* Send initial heartbeat. */ + replicationHeartbeat(r); } - /* Send initial heartbeat. */ - replicationHeartbeat(r); } else { tracef("votes quorum not reached"); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/tick.c new/raft-0.9.22/src/tick.c --- old/raft-0.9.19/src/tick.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/tick.c 2020-05-29 11:33:07.000000000 +0200 @@ -79,7 +79,7 @@ */ if (electionTimerExpired(r)) { tracef("start new election"); - return electionStart(r, false); + return electionStart(r); } return 0; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/uv_encoding.c new/raft-0.9.22/src/uv_encoding.c --- old/raft-0.9.19/src/uv_encoding.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/uv_encoding.c 2020-05-29 11:33:07.000000000 +0200 @@ -80,12 +80,20 @@ static void encodeRequestVote(const struct raft_request_vote *p, void *buf) { void *cursor = buf; + uint64_t flags = 0; + + if (p->disrupt_leader) { + flags |= 1 << 0; + } + if (p->pre_vote) { + flags |= 1 << 1; + } bytePut64(&cursor, p->term); bytePut64(&cursor, p->candidate_id); bytePut64(&cursor, p->last_log_index); bytePut64(&cursor, p->last_log_term); - bytePut64(&cursor, p->disrupt_leader ? 
1 : 0); + bytePut64(&cursor, flags); } static void encodeRequestVoteResult(const struct raft_request_vote_result *p, @@ -149,7 +157,6 @@ bytePut64(&cursor, p->last_log_term); } - int uvEncodeMessage(const struct raft_message *message, uv_buf_t **bufs, unsigned *n_bufs) @@ -299,8 +306,11 @@ /* Support for legacy request vote that doesn't have disrupt_leader. */ if (buf->len == sizeofRequestVoteV1()) { p->disrupt_leader = false; + p->pre_vote = false; } else { - p->disrupt_leader = byteGet64(&cursor) == 1; + uint64_t flags = byteGet64(&cursor); + p->disrupt_leader = (bool)(flags & 1 << 0); + p->pre_vote = (bool)(flags & 1 << 1); } } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/src/uv_os.c new/raft-0.9.22/src/uv_os.c --- old/raft-0.9.19/src/uv_os.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/src/uv_os.c 2020-05-29 11:33:07.000000000 +0200 @@ -40,7 +40,7 @@ /* Emulate fallocate(). Mostly taken from glibc's implementation. */ static int uvOsFallocateEmulation(int fd, off_t offset, off_t len) { - unsigned increment; + ssize_t increment; struct statfs f; int rv; @@ -59,7 +59,7 @@ for (offset += (len - 1) % increment; len > 0; offset += increment) { len -= increment; - rv = pwrite(fd, "", 1, offset); + rv = (int)pwrite(fd, "", 1, offset); if (rv != 1) return errno; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/raft-0.9.19/test/integration/test_election.c new/raft-0.9.22/test/integration/test_election.c --- old/raft-0.9.19/test/integration/test_election.c 2020-05-12 16:34:21.000000000 +0200 +++ new/raft-0.9.22/test/integration/test_election.c 2020-05-29 11:33:07.000000000 +0200 @@ -575,3 +575,44 @@ return MUNIT_OK; } + +/* Test an election round with two voters and pre-vote. 
*/ +TEST(election, preVote, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + raft_set_pre_vote(CLUSTER_RAFT(0), true); + raft_set_pre_vote(CLUSTER_RAFT(1), true); + CLUSTER_START; + + /* The first server eventually times out and converts to candidate, but it + * does not increment its term yet.*/ + STEP_UNTIL_CANDIDATE(0); + ASSERT_TIME(1000); + ASSERT_TERM(0, 1); + + CLUSTER_STEP; /* Server 1 tick */ + ASSERT_FOLLOWER(1); + + CLUSTER_STEP; /* Server 0 completes sending a pre-vote RequestVote RPC */ + CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */ + ASSERT_TERM(1, 1); /* Server 1 does not increment its term */ + ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */ + ASSERT_TIME(1015); + + CLUSTER_STEP; /* Server 1 completes sending pre-vote RequestVote result */ + CLUSTER_STEP; /* Server 0 receives the pre-vote RequestVote result */ + ASSERT_CANDIDATE(0); + ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */ + ASSERT_TIME(1030); + + CLUSTER_STEP; /* Server 0 completes sending an actual RequestVote RPC */ + CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */ + ASSERT_TERM(1, 2); /* Server 1 does increment its term. */ + ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */ + + CLUSTER_STEP; /* Server 1 completes sending actual RequestVote result */ + CLUSTER_STEP; /* Server 0 receives the actual RequestVote result */ + ASSERT_LEADER(0); + + return MUNIT_OK; +}
