GlenGeng commented on a change in pull request #150: URL: https://github.com/apache/incubator-ratis/pull/150#discussion_r462238086
########## File path: ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderState.java ########## @@ -792,6 +794,47 @@ private void replicateNewConf() { return lists; } + /** + * See the thesis section 6.2: A leader in Raft steps down + * if an election timeout elapses without a successful + * round of heartbeats to a majority of its cluster. + */ + private void checkLeadership() { + // The initial value of lastRpcResponseTime in FollowerInfo is set by + // LeaderState::addSenders(), which is fake and used to trigger an + // immediate round of AppendEntries request. Since candidates collect + // votes from majority before becoming leader, without seeing higher term, + // ideally, A leader is legal for election timeout if become leader soon. + if (server.getRole().getRoleElapsedTimeMs() < server.getMaxTimeoutMs()) { + return; + } + + final List<RaftPeerId> activePeers = senders.stream() + .filter(sender -> sender.getFollower() + .getLastRpcResponseTime() + .elapsedTimeMs() <= server.getMaxTimeoutMs()) + .map(sender -> sender.getFollower().getPeer().getId()) + .collect(Collectors.toList()); + + final RaftConfiguration conf = server.getRaftConf(); + + if (conf.hasMajority(activePeers, server.getId())) { + // leadership check passed + return; + } + + List<FollowerInfo> followers = senders.stream() + .map(LogAppender::getFollower).collect(Collectors.toList()); + + LOG.warn(this + ": Lost leadership on term: " + currentTerm + + ". Election timeout: " + server.getMaxTimeoutMs() + "ms" + + ". In charge for: " + server.getRole().getRoleElapsedTimeMs() + "ms" + + ". Conf: " + conf + ". Followers: " + followers); + + // become follower of next term. 
+ stepDown(currentTerm + 1); Review comment: ok ########## File path: ratis-server/src/test/java/org/apache/ratis/RaftAsyncTests.java ########## @@ -402,6 +407,46 @@ void runTestAppendEntriesTimeout(CLUSTER cluster) throws Exception { RaftServerConfigKeys.RetryCache.setExpiryTime(getProperties(), oldExpiryTime); } + @Test + public void testCheckLeadershipFailure() throws Exception { + runWithNewCluster(NUM_SERVERS, this::runTestCheckLeadershipFailure); + } + + void runTestCheckLeadershipFailure(CLUSTER cluster) throws Exception { Review comment: "This test can also pass when followers turn to candidate state and trigger an election." We don't need to worry about this: the `outstandingOp` will prevent the followers from timing out. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org