This is an automated email from the ASF dual-hosted git repository.
williamsong pushed a commit to branch feature/leaderlease
in repository https://gitbox.apache.org/repos/asf/ratis.git
The following commit(s) were added to refs/heads/feature/leaderlease by this push:
new a13d81e9a RATIS-1866. Maintain leader lease after AppendEntries (#898)
a13d81e9a is described below
commit a13d81e9a5720b39c5fea2fd59228fdc4b0b7691
Author: William Song <[email protected]>
AuthorDate: Fri Sep 22 09:54:17 2023 +0800
RATIS-1866. Maintain leader lease after AppendEntries (#898)
---
.../main/java/org/apache/ratis/util/Timestamp.java | 5 ++
.../apache/ratis/grpc/server/GrpcLogAppender.java | 11 ++-
.../apache/ratis/server/leader/FollowerInfo.java | 6 ++
.../ratis/server/impl/DivisionPropertiesImpl.java | 11 ---
.../apache/ratis/server/impl/FollowerInfoImpl.java | 12 +++
.../org/apache/ratis/server/impl/LeaderLease.java | 94 ++++++++++++++++++++++
.../apache/ratis/server/impl/LeaderStateImpl.java | 19 +++++
.../ratis/server/leader/LogAppenderDefault.java | 3 +
.../ratis/server/impl/LeaderElectionTests.java | 82 +++++++++++++++++++
.../ratis/server/impl/RaftServerTestUtil.java | 6 ++
10 files changed, 236 insertions(+), 13 deletions(-)
diff --git a/ratis-common/src/main/java/org/apache/ratis/util/Timestamp.java b/ratis-common/src/main/java/org/apache/ratis/util/Timestamp.java
index ba5fb8c5b..cff143eef 100644
--- a/ratis-common/src/main/java/org/apache/ratis/util/Timestamp.java
+++ b/ratis-common/src/main/java/org/apache/ratis/util/Timestamp.java
@@ -51,6 +51,11 @@ public final class Timestamp implements Comparable<Timestamp> {
return a.compareTo(b) > 0? a: b;
}
+ /** @return the earliest timestamp. */
+ public static Timestamp earliest(Timestamp a, Timestamp b) {
+ return a.compareTo(b) > 0? b: a;
+ }
+
private final long nanos;
private Timestamp(long nanos) {
diff --git a/ratis-grpc/src/main/java/org/apache/ratis/grpc/server/GrpcLogAppender.java b/ratis-grpc/src/main/java/org/apache/ratis/grpc/server/GrpcLogAppender.java
index eb3195326..348b5310b 100644
--- a/ratis-grpc/src/main/java/org/apache/ratis/grpc/server/GrpcLogAppender.java
+++ b/ratis-grpc/src/main/java/org/apache/ratis/grpc/server/GrpcLogAppender.java
@@ -390,7 +390,9 @@ public class GrpcLogAppender extends LogAppenderBase {
AppendEntriesRequest request = pendingRequests.remove(reply);
if (request != null) {
request.stopRequestTimer(); // Update completion time
+
getFollower().updateLastRespondedAppendEntriesSendTime(request.getSendTime());
}
+ getFollower().updateLastRpcResponseTime();
if (LOG.isDebugEnabled()) {
LOG.debug("{}: received {} reply {}, request={}",
@@ -407,8 +409,6 @@ public class GrpcLogAppender extends LogAppenderBase {
}
private void onNextImpl(AppendEntriesReplyProto reply) {
- // update the last rpc time
- getFollower().updateLastRpcResponseTime();
errCount.set(0);
if (!firstResponseReceived) {
@@ -770,6 +770,8 @@ public class GrpcLogAppender extends LogAppenderBase {
private final TermIndex lastEntry;
+ private volatile Timestamp sendTime;
+
AppendEntriesRequest(AppendEntriesRequestProto proto, RaftPeerId
followerId, GrpcServerMetrics grpcServerMetrics) {
this.callId = proto.getServerRequest().getCallId();
this.previousLog = proto.hasPreviousLog()?
TermIndex.valueOf(proto.getPreviousLog()): null;
@@ -788,8 +790,13 @@ public class GrpcLogAppender extends LogAppenderBase {
return previousLog;
}
+ public Timestamp getSendTime() {
+ return sendTime;
+ }
+
void startRequestTimer() {
timerContext = timer.time();
+ sendTime = Timestamp.currentTime();
}
void stopRequestTimer() {
diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/leader/FollowerInfo.java b/ratis-server-api/src/main/java/org/apache/ratis/server/leader/FollowerInfo.java
index 1dd4066e8..9d5c891d9 100644
--- a/ratis-server-api/src/main/java/org/apache/ratis/server/leader/FollowerInfo.java
+++ b/ratis-server-api/src/main/java/org/apache/ratis/server/leader/FollowerInfo.java
@@ -101,4 +101,10 @@ public interface FollowerInfo {
/** @return the latest heartbeat send time. */
Timestamp getLastHeartbeatSendTime();
+
+ /** @return the send time of the last responded AppendEntries rpc. */
+ Timestamp getLastRespondedAppendEntriesSendTime();
+
+ /** Update the send time of the last responded AppendEntries rpc. */
+ void updateLastRespondedAppendEntriesSendTime(Timestamp sendTime);
}
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/DivisionPropertiesImpl.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/DivisionPropertiesImpl.java
index b3e8dd94d..63cbc02ed 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/impl/DivisionPropertiesImpl.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/DivisionPropertiesImpl.java
@@ -28,7 +28,6 @@ class DivisionPropertiesImpl implements DivisionProperties {
private final TimeDuration rpcTimeoutMax;
private final TimeDuration rpcSleepTime;
private final TimeDuration rpcSlownessTimeout;
- private final TimeDuration leaderLeaseTimeout;
DivisionPropertiesImpl(RaftProperties properties) {
this.rpcTimeoutMin = RaftServerConfigKeys.Rpc.timeoutMin(properties);
@@ -36,11 +35,6 @@ class DivisionPropertiesImpl implements DivisionProperties {
Preconditions.assertTrue(rpcTimeoutMax.compareTo(rpcTimeoutMin) >= 0,
"rpcTimeoutMax = %s < rpcTimeoutMin = %s", rpcTimeoutMax,
rpcTimeoutMin);
- final double leaderLeaseTimeoutRatio =
RaftServerConfigKeys.Read.leaderLeaseTimeoutRatio(properties);
- this.leaderLeaseTimeout =
this.rpcTimeoutMin.multiply(leaderLeaseTimeoutRatio);
- Preconditions.assertTrue(rpcTimeoutMin.compareTo(leaderLeaseTimeout) >= 0,
- "rpcTimeoutMin = %s < leaderLeaseTimeout = %s", rpcTimeoutMin,
leaderLeaseTimeout);
-
this.rpcSleepTime = RaftServerConfigKeys.Rpc.sleepTime(properties);
this.rpcSlownessTimeout =
RaftServerConfigKeys.Rpc.slownessTimeout(properties);
}
@@ -55,11 +49,6 @@ class DivisionPropertiesImpl implements DivisionProperties {
return rpcTimeoutMax;
}
- /** @return the ratio of leader lease timeout */
- public TimeDuration leaderLeaseTimeout() {
- return leaderLeaseTimeout;
- }
-
@Override
public TimeDuration rpcSleepTime() {
return rpcSleepTime;
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/FollowerInfoImpl.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/FollowerInfoImpl.java
index 245cbc888..91ab90a20 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/impl/FollowerInfoImpl.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/FollowerInfoImpl.java
@@ -39,6 +39,7 @@ class FollowerInfoImpl implements FollowerInfo {
private final AtomicReference<Timestamp> lastRpcResponseTime;
private final AtomicReference<Timestamp> lastRpcSendTime;
private final AtomicReference<Timestamp> lastHeartbeatSendTime;
+ private final AtomicReference<Timestamp> lastRespondedAppendEntriesSendTime;
private final RaftLogIndex nextIndex;
private final RaftLogIndex matchIndex = new RaftLogIndex("matchIndex",
RaftLog.INVALID_LOG_INDEX);
private final RaftLogIndex commitIndex = new RaftLogIndex("commitIndex",
RaftLog.INVALID_LOG_INDEX);
@@ -57,6 +58,7 @@ class FollowerInfoImpl implements FollowerInfo {
this.lastRpcResponseTime = new AtomicReference<>(lastRpcTime);
this.lastRpcSendTime = new AtomicReference<>(lastRpcTime);
this.lastHeartbeatSendTime = new AtomicReference<>(lastRpcTime);
+ this.lastRespondedAppendEntriesSendTime = new
AtomicReference<>(lastRpcTime);
this.nextIndex = new RaftLogIndex("nextIndex", nextIndex);
this.caughtUp = caughtUp;
}
@@ -202,4 +204,14 @@ class FollowerInfoImpl implements FollowerInfo {
public Timestamp getLastHeartbeatSendTime() {
return lastHeartbeatSendTime.get();
}
+
+ @Override
+ public Timestamp getLastRespondedAppendEntriesSendTime() {
+ return lastRespondedAppendEntriesSendTime.get();
+ }
+
+ @Override
+ public void updateLastRespondedAppendEntriesSendTime(Timestamp sendTime) {
+ lastRespondedAppendEntriesSendTime.set(sendTime);
+ }
}
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderLease.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderLease.java
new file mode 100644
index 000000000..758cbb6c5
--- /dev/null
+++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderLease.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.ratis.server.impl;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.protocol.RaftPeerId;
+import org.apache.ratis.server.RaftServerConfigKeys;
+import org.apache.ratis.server.leader.FollowerInfo;
+import org.apache.ratis.util.Preconditions;
+import org.apache.ratis.util.Timestamp;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+class LeaderLease {
+
+ private final long leaseTimeoutMs;
+ // TODO invalidate leader lease when stepDown / transferLeader
+ private final AtomicReference<Timestamp> lease = new
AtomicReference<>(Timestamp.currentTime());
+
+ LeaderLease(RaftProperties properties) {
+ final double leaseRatio =
RaftServerConfigKeys.Read.leaderLeaseTimeoutRatio(properties);
+ Preconditions.assertTrue(leaseRatio > 0.0 && leaseRatio <= 1.0,
+ "leader ratio should sit in (0,1], now is " + leaseRatio);
+ this.leaseTimeoutMs = RaftServerConfigKeys.Rpc.timeoutMin(properties)
+ .multiply(leaseRatio)
+ .toIntExact(TimeUnit.MILLISECONDS);
+ }
+
+ boolean isValid() {
+ return lease.get().elapsedTimeMs() < leaseTimeoutMs;
+ }
+
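+ /**
+ * Try extending the lease based on the send times of the last responded AppendEntries of the followers.
+ * @param current the current followers
+ * @param old the old followers, nullable
+ * @param hasMajority tests whether the given active peers form a majority
+ */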
+ void extend(List<FollowerInfo> current, List<FollowerInfo> old,
Predicate<List<RaftPeerId>> hasMajority) {
+ final List<RaftPeerId> activePeers =
+ // check the latest responded AppendEntries of all peers (including those in the transitional configuration)
+ Stream.concat(current.stream(),
Optional.ofNullable(old).map(List::stream).orElse(Stream.empty()))
+ .filter(f ->
f.getLastRespondedAppendEntriesSendTime().elapsedTimeMs() < leaseTimeoutMs)
+ .map(FollowerInfo::getId)
+ .collect(Collectors.toList());
+
+ if (!hasMajority.test(activePeers)) {
+ return;
+ }
+
+ // update the new lease
+ final Timestamp newLease =
+ Timestamp.earliest(getMaxTimestampWithMajorityAck(current),
getMaxTimestampWithMajorityAck(old));
+ lease.set(newLease);
+ }
+
+ /**
+ * @return the maximum timestamp at which the majority of followers are known to be active;
+ * or {@link Timestamp#currentTime()} if the followers are null or empty.
+ */
+ private Timestamp getMaxTimestampWithMajorityAck(List<FollowerInfo>
followers) {
+ if (followers == null || followers.isEmpty()) {
+ return Timestamp.currentTime();
+ }
+
+ final int mid = followers.size() / 2;
+ return followers.stream()
+ .map(FollowerInfo::getLastRespondedAppendEntriesSendTime)
+ .sorted()
+ .limit(mid+1)
+ .skip(mid)
+ .iterator()
+ .next();
+ }
+}
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
index 5156585f8..418139378 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
@@ -348,6 +348,7 @@ class LeaderStateImpl implements LeaderState {
private final PendingStepDown pendingStepDown;
private final ReadIndexHeartbeats readIndexHeartbeats;
+ private final LeaderLease lease;
LeaderStateImpl(RaftServerImpl server) {
this.name = server.getMemberId() + "-" +
JavaUtils.getClassSimpleName(getClass());
@@ -369,6 +370,7 @@ class LeaderStateImpl implements LeaderState {
this.messageStreamRequests = new
MessageStreamRequests(server.getMemberId());
this.pendingStepDown = new PendingStepDown(this);
this.readIndexHeartbeats = new ReadIndexHeartbeats();
+ this.lease = new LeaderLease(properties);
long maxPendingRequests =
RaftServerConfigKeys.Write.elementLimit(properties);
double followerGapRatioMax =
RaftServerConfigKeys.Write.followerGapRatioMax(properties);
@@ -1127,6 +1129,23 @@ class LeaderStateImpl implements LeaderState {
readIndexHeartbeats.onAppendEntriesReply(appender, reply,
this::hasMajority);
}
+ boolean hasLease() {
+ if (checkLeaderLease()) {
+ return true;
+ }
+
+ // try extending the leader lease
+ final RaftConfigurationImpl conf = server.getRaftConf();
+ final CurrentOldFollowerInfos info =
followerInfoMap.getFollowerInfos(conf);
+ lease.extend(info.getCurrent(), info.getOld(), peers ->
conf.hasMajority(peers, server.getId()));
+
+ return checkLeaderLease();
+ }
+
+ private boolean checkLeaderLease() {
+ return isReady() && (server.getRaftConf().isSingleton() ||
lease.isValid());
+ }
+
void replyPendingRequest(long logIndex, RaftClientReply reply) {
pendingRequests.replyPendingRequest(logIndex, reply);
}
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderDefault.java b/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderDefault.java
index 8f71f91fc..6f38f5009 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderDefault.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderDefault.java
@@ -26,6 +26,7 @@ import org.apache.ratis.server.RaftServer;
import org.apache.ratis.server.raftlog.RaftLogIOException;
import org.apache.ratis.server.util.ServerStringUtils;
import org.apache.ratis.statemachine.SnapshotInfo;
+import org.apache.ratis.util.Timestamp;
import java.io.IOException;
import java.io.InterruptedIOException;
@@ -73,9 +74,11 @@ class LogAppenderDefault extends LogAppenderBase {
}
resetHeartbeatTrigger();
+ final Timestamp sendTime = Timestamp.currentTime();
getFollower().updateLastRpcSendTime(request.getEntriesCount() == 0);
final AppendEntriesReplyProto r =
getServerRpc().appendEntries(request);
getFollower().updateLastRpcResponseTime();
+ getFollower().updateLastRespondedAppendEntriesSendTime(sendTime);
getLeaderState().onFollowerCommitIndex(getFollower(),
r.getFollowerCommit());
return r;
diff --git a/ratis-server/src/test/java/org/apache/ratis/server/impl/LeaderElectionTests.java b/ratis-server/src/test/java/org/apache/ratis/server/impl/LeaderElectionTests.java
index 9e2b7bd2d..c2e5cbd1c 100644
--- a/ratis-server/src/test/java/org/apache/ratis/server/impl/LeaderElectionTests.java
+++ b/ratis-server/src/test/java/org/apache/ratis/server/impl/LeaderElectionTests.java
@@ -43,6 +43,7 @@ import org.apache.ratis.util.LifeCycle;
import org.apache.ratis.util.Slf4jUtils;
import org.apache.ratis.util.TimeDuration;
import org.apache.ratis.util.Timestamp;
+import org.apache.ratis.util.function.CheckedBiConsumer;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
@@ -640,6 +641,87 @@ public abstract class LeaderElectionTests<CLUSTER extends MiniRaftCluster>
}
}
+ private void runLeaseTest(CLUSTER cluster, CheckedBiConsumer<CLUSTER, Long,
Exception> testCase) throws Exception {
+ final double leaseRatio =
RaftServerConfigKeys.Read.leaderLeaseTimeoutRatio(getProperties());
+ final long leaseTimeoutMs =
RaftServerConfigKeys.Rpc.timeoutMin(getProperties())
+ .multiply(leaseRatio)
+ .toIntExact(TimeUnit.MILLISECONDS);
+ testCase.accept(cluster, leaseTimeoutMs);
+ }
+
+ @Test
+ public void testLeaderLease() throws Exception {
+ // use a strict lease
+ RaftServerConfigKeys.Read.setLeaderLeaseTimeoutRatio(getProperties(), 0.5);
+ runWithNewCluster(3, c -> runLeaseTest(c, this::runTestLeaderLease));
+ }
+
+ void runTestLeaderLease(CLUSTER cluster, long leaseTimeoutMs) throws
Exception {
+ final RaftServer.Division leader = RaftTestUtil.waitForLeader(cluster);
+ try (final RaftClient client = cluster.createClient(leader.getId())) {
+ client.io().send(new RaftTestUtil.SimpleMessage("message"));
+
+ Assert.assertTrue(leader.getInfo().isLeader());
+ Assert.assertTrue(leader.getInfo().isLeaderReady());
+ RaftServerTestUtil.assertLeaderLease(leader, true);
+
+ isolate(cluster, leader.getId());
+ Thread.sleep(leaseTimeoutMs);
+
+ Assert.assertTrue(leader.getInfo().isLeader());
+ Assert.assertTrue(leader.getInfo().isLeaderReady());
+ RaftServerTestUtil.assertLeaderLease(leader, false);
+ } finally {
+ deIsolate(cluster, leader.getId());
+ }
+ }
+
+ @Test
+ public void testLeaderLeaseDuringReconfiguration() throws Exception {
+ // use a strict lease
+ RaftServerConfigKeys.Read.setLeaderLeaseTimeoutRatio(getProperties(), 0.5);
+ runWithNewCluster(3, c -> runLeaseTest(c,
this::runTestLeaderLeaseDuringReconfiguration));
+ }
+
+ void runTestLeaderLeaseDuringReconfiguration(CLUSTER cluster, long
leaseTimeoutMs) throws Exception {
+ final RaftServer.Division leader = RaftTestUtil.waitForLeader(cluster);
+ try (final RaftClient client = cluster.createClient(leader.getId())) {
+ client.io().send(new RaftTestUtil.SimpleMessage("message"));
+
+ Assert.assertTrue(leader.getInfo().isLeader());
+ Assert.assertTrue(leader.getInfo().isLeaderReady());
+ RaftServerTestUtil.assertLeaderLease(leader, true);
+
+ final List<RaftServer.Division> followers = cluster.getFollowers();
+ final MiniRaftCluster.PeerChanges changes = cluster.addNewPeers(2, true);
+
+ // blocking the original 2 followers
+
BlockRequestHandlingInjection.getInstance().blockReplier(followers.get(0).getId().toString());
+
BlockRequestHandlingInjection.getInstance().blockReplier(followers.get(1).getId().toString());
+
+ // start reconfiguration in another thread, shall fail eventually
+ new Thread(() -> {
+ try {
+ client.admin().setConfiguration(changes.allPeersInNewConf);
+ } catch (IOException e) {
+ System.out.println("as expected: " + e.getMessage());
+ }
+ }).start();
+
+ Thread.sleep(leaseTimeoutMs);
+
+ Assert.assertTrue(leader.getInfo().isLeader());
+ Assert.assertTrue(leader.getInfo().isLeaderReady());
+ RaftServerTestUtil.assertLeaderLease(leader, false);
+
+ } finally {
+ BlockRequestHandlingInjection.getInstance().unblockAll();
+ }
+ }
+
+
+
+
private static RaftServerImpl createMockServer(boolean alive) {
final DivisionInfo info = mock(DivisionInfo.class);
when(info.isAlive()).thenReturn(alive);
diff --git a/ratis-server/src/test/java/org/apache/ratis/server/impl/RaftServerTestUtil.java b/ratis-server/src/test/java/org/apache/ratis/server/impl/RaftServerTestUtil.java
index 618e398b3..958c19442 100644
--- a/ratis-server/src/test/java/org/apache/ratis/server/impl/RaftServerTestUtil.java
+++ b/ratis-server/src/test/java/org/apache/ratis/server/impl/RaftServerTestUtil.java
@@ -147,6 +147,12 @@ public class RaftServerTestUtil {
return
getLeaderState(server).map(LeaderStateImpl::getLogAppenders).orElse(null);
}
+ public static void assertLeaderLease(RaftServer.Division leader, boolean
hasLease) {
+ final LeaderStateImpl l = getLeaderState(leader).orElse(null);
+ Assert.assertNotNull(l);
+ Assert.assertEquals(l.hasLease(), hasLease);
+ }
+
public static void restartLogAppenders(RaftServer.Division server) {
final LeaderStateImpl leaderState = getLeaderState(server).orElseThrow(
() -> new IllegalStateException(server + " is not the leader"));