This is an automated email from the ASF dual-hosted git repository.

szetszwo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ratis.git


The following commit(s) were added to refs/heads/master by this push:
     new e2c867da5 RATIS-2345. Leader stepDown could cause a deadlock. (#1300)
e2c867da5 is described below

commit e2c867da55f8caf789db5e759ffebc2e79a4961d
Author: Tsz-Wo Nicholas Sze <[email protected]>
AuthorDate: Tue Oct 14 08:18:01 2025 -0700

    RATIS-2345. Leader stepDown could cause a deadlock. (#1300)
---
 .../java/org/apache/ratis/server/impl/LeaderStateImpl.java     |  8 +++++---
 .../java/org/apache/ratis/server/impl/PendingStepDown.java     | 10 +++++++++-
 .../src/test/java/org/apache/ratis/RaftBasicTests.java         |  6 +++---
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
index 8358f063d..0835802bd 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
@@ -703,10 +703,12 @@ class LeaderStateImpl implements LeaderState {
   private void stepDown(long term, StepDownReason reason) {
     try {
       lease.getAndSetEnabled(false);
-      server.changeToFollowerAndPersistMetadata(term, false, reason).join();
+      server.changeToFollowerAndPersistMetadata(term, false, reason)
+          .get(5, TimeUnit.SECONDS);
       pendingStepDown.complete(server::newSuccessReply);
-    } catch(IOException e) {
-      final String s = this + ": Failed to persist metadata for term " + term;
+    } catch(Exception e) {
+      pendingStepDown.completeExceptionally(e);
+      final String s = this + ": Failed to step down for term " + term;
       LOG.warn(s, e);
       // the failure should happen while changing the state to follower
       // thus the in-memory state should have been updated
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/PendingStepDown.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/PendingStepDown.java
index b7bfde3f6..c1e5cc5f5 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/impl/PendingStepDown.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/PendingStepDown.java
@@ -56,8 +56,12 @@ public class PendingStepDown {
       replyFuture.complete(newSuccessReply.apply(request));
     }
 
+    void completeExceptionally(Exception e) {
+      replyFuture.completeExceptionally(e);
+    }
+
     void timeout() {
-      replyFuture.completeExceptionally(new TimeoutIOException(
+      completeExceptionally(new TimeoutIOException(
           ": Failed to step down leader on " +  leader + "request " + request.getTimeoutMs() + "ms"));
     }
 
@@ -105,6 +109,10 @@ public class PendingStepDown {
     pending.getAndSetNull().ifPresent(p -> p.complete(newSuccessReply));
   }
 
+  void completeExceptionally(Exception e) {
+    pending.getAndSetNull().ifPresent(p -> p.completeExceptionally(e));
+  }
+
   void timeout() {
     pending.getAndSetNull().ifPresent(PendingRequest::timeout);
   }
diff --git a/ratis-server/src/test/java/org/apache/ratis/RaftBasicTests.java b/ratis-server/src/test/java/org/apache/ratis/RaftBasicTests.java
index 156cecf0b..f1319cde7 100644
--- a/ratis-server/src/test/java/org/apache/ratis/RaftBasicTests.java
+++ b/ratis-server/src/test/java/org/apache/ratis/RaftBasicTests.java
@@ -360,7 +360,7 @@ public abstract class RaftBasicTests<CLUSTER extends MiniRaftCluster>
 
     final Timer timer = new Timer();
     timer.schedule(new TimerTask() {
-      private int previousLastStep = lastStep.get();
+      private final AtomicInteger previousLastStep = new AtomicInteger(lastStep.get());
 
       @Override
       public void run() {
@@ -371,8 +371,8 @@ public abstract class RaftBasicTests<CLUSTER extends MiniRaftCluster>
         JavaUtils.dumpAllThreads(s -> log.info(s));
 
         final int last = lastStep.get();
-        if (last != previousLastStep) {
-          previousLastStep = last;
+        if (last != previousLastStep.get()) {
+          previousLastStep.set(last);
         } else {
           final RaftServer.Division leader = cluster.getLeader();
           log.info("NO PROGRESS at " + last + ", try to restart leader=" + leader);

Reply via email to