Cyrill commented on code in PR #7507:
URL: https://github.com/apache/ignite-3/pull/7507#discussion_r2872926206
##########
modules/raft/src/test/java/org/apache/ignite/internal/raft/client/PhysicalTopologyAwareRaftGroupServiceRunTest.java:
##########
@@ -880,49 +828,222 @@ private void verifyExact3PeersCalled() {
assertThat(triedPeers, equalTo(expectedPeers));
}
- private static class TestResponse {
+ // ==========================================================================================
+ // Tests for refreshLeader and refreshAndGetLeaderWithTerm (RANDOM strategy)
+ // ==========================================================================================
+
+ /**
+ * Tests that for GetLeaderRequest with UNKNOWN/EINTERNAL/ENOENT errors,
the executor tries another peer
+ * instead of retrying the same peer. This matches RaftGroupServiceImpl
behavior.
+ *
+ * <p>The test verifies that after receiving one of these errors from a
peer, the next request goes to
+ * a DIFFERENT peer, not the same one.
+ */
+ @ParameterizedTest
+ @EnumSource(names = {"UNKNOWN", "EINTERNAL", "ENOENT"})
+ void testGetLeaderRequestTriesDifferentPeerOnTransientError(RaftError error) throws Exception {
+ Set<String> calledPeers = ConcurrentHashMap.newKeySet();
+ AtomicInteger callCount = new AtomicInteger(0);
+
+ when(messagingService.invoke(
+ any(InternalClusterNode.class),
+ any(GetLeaderRequest.class),
+ anyLong())
+ ).thenAnswer(invocation -> {
+ InternalClusterNode target = invocation.getArgument(0);
+ calledPeers.add(target.name());
+ int count = callCount.incrementAndGet();
+
+ if (count == 1) {
+ // First call returns transient error.
+ return completedFuture(FACTORY.errorResponse()
+ .errorCode(error.getNumber())
+ .build());
+ }
+
+ // Second call succeeds.
+ return completedFuture(FACTORY.getLeaderResponse()
+ .leaderId(PeerId.fromPeer(NODES.get(0)).toString())
+ .currentTerm(CURRENT_TERM)
+ .build());
+ });
+
+ PhysicalTopologyAwareRaftGroupService svc = startService();
+
+ CompletableFuture<LeaderWithTerm> result = svc.refreshAndGetLeaderWithTerm(TIMEOUT);
+
+ assertThat(result, willCompleteSuccessfully());
+
+ // Verify that at least 2 different peers were called.
+ // If the same peer was retried, calledPeers would have size 1.
+ assertThat("Should try different peers on " + error + " error, but called peers were: " + calledPeers,
+ calledPeers.size(), greaterThan(1));
}
/**
- * Tests single-attempt mode (timeout=0) with 5 nodes: all return "no
leader".
+ * Tests that RANDOM strategy with bounded timeout keeps cycling through
peers until timeout.
*
- * <p>In single-attempt mode, "no leader" is treated same as unavailable.
- * Each peer is tried exactly once, then fails with
ReplicationGroupUnavailableException.
+ * <p>When all peers return EPERM (no leader), RaftGroupServiceImpl resets
unavailable peers
+ * and keeps cycling until timeout. RaftCommandExecutor should do the same
for RANDOM strategy.
*/
@Test
- void testSingleAttemptModeWithAllNoLeader() {
+ void testRandomStrategyWithBoundedTimeoutKeepsCyclingUntilTimeout() throws Exception {
+ AtomicInteger callCount = new AtomicInteger(0);
+ CountDownLatch sixCallsReached = new CountDownLatch(6);
+
+ // All peers return EPERM (no leader) every time.
+ when(messagingService.invoke(
+ any(InternalClusterNode.class),
+ any(GetLeaderRequest.class),
+ anyLong())
+ ).thenAnswer(invocation -> {
+ callCount.incrementAndGet();
+ sixCallsReached.countDown();
+ return completedFuture(FACTORY.errorResponse()
+ .errorCode(RaftError.EPERM.getNumber())
+ .leaderId(null)
+ .build());
+ });
+
+ PhysicalTopologyAwareRaftGroupService svc = startService();
+
+ // With bounded timeout (500ms), should keep cycling until timeout.
+ // With 3 peers and 50ms retry delay, we should get more than 3 calls
before timeout.
+ CompletableFuture<Void> result = svc.refreshLeader(500);
+
+ // Wait for at least 6 calls (2 complete cycles through all 3 peers).
+ boolean sixCallsHappened = sixCallsReached.await(2, TimeUnit.SECONDS);
+
+ assertThat(result, willThrow(ReplicationGroupUnavailableException.class, 2, TimeUnit.SECONDS));
+
+ // Verify more than 3 calls were made (cycling through peers multiple
times).
+ assertThat("Should cycle through peers multiple times before timeout,
but only " + callCount.get() + " calls were made",
+ sixCallsHappened, is(true));
+ assertThat(callCount.get(), greaterThan(3));
+ }
+
+ /**
+ * Tests that the timeout parameter is respected for
refreshLeader/refreshAndGetLeaderWithTerm.
+ * With a short timeout, the request should fail fast, not wait for the
default response timeout.
+ */
+ @Test
+ void testTimeoutParameterIsRespectedForRefreshLeader() {
+ // All peers never respond (simulating network issue).
+ when(messagingService.invoke(
+ any(InternalClusterNode.class),
+ any(GetLeaderRequest.class),
+ anyLong())
+ ).thenReturn(new CompletableFuture<>()); // Never completes
+
+ PhysicalTopologyAwareRaftGroupService svc = startService();
+
+ long startTime = System.currentTimeMillis();
+
+ // With 200ms timeout, should fail within ~300ms, not the default
3000ms.
+ CompletableFuture<Void> result = svc.refreshLeader(200);
+
+ assertThat(result, willThrow(ReplicationGroupUnavailableException.class, 1, TimeUnit.SECONDS));
+
+ long elapsed = System.currentTimeMillis() - startTime;
+ assertThat("Expected to fail within 1000ms but took " + elapsed +
"ms", elapsed < 1000, is(true));
+ }
+
+ /**
+ * Tests that RANDOM strategy with timeout=0 (single attempt) tries all
peers once and fails.
+ *
+ * <p>With timeout=0, each peer should be tried at most once, then fail
with ReplicationGroupUnavailableException.
+ */
+ @Test
+ void testRandomStrategySingleAttemptTriesAllPeersOnce() throws Exception {
AtomicInteger callCount = new AtomicInteger(0);
Set<String> calledPeers = ConcurrentHashMap.newKeySet();
+ CountDownLatch constructorCallsDone = new CountDownLatch(3); //
Constructor tries all 3 peers
- // All peers return EPERM with no leader.
+ // All peers return EPERM (no leader).
when(messagingService.invoke(
any(InternalClusterNode.class),
- argThat(this::isTestWriteCommand),
- anyLong()
- )).thenAnswer(invocation -> {
+ any(GetLeaderRequest.class),
+ anyLong())
+ ).thenAnswer(invocation -> {
InternalClusterNode target = invocation.getArgument(0);
calledPeers.add(target.name());
- callCount.incrementAndGet();
+ int count = callCount.incrementAndGet();
+ // Signal when constructor's calls complete (first 3 calls).
+ if (count <= 3) {
Review Comment:
changed
##########
modules/raft/src/test/java/org/apache/ignite/internal/raft/client/PhysicalTopologyAwareRaftGroupServiceRunTest.java:
##########
@@ -880,49 +828,222 @@ private void verifyExact3PeersCalled() {
assertThat(triedPeers, equalTo(expectedPeers));
}
- private static class TestResponse {
+ // ==========================================================================================
+ // Tests for refreshLeader and refreshAndGetLeaderWithTerm (RANDOM strategy)
+ // ==========================================================================================
+
+ /**
+ * Tests that for GetLeaderRequest with UNKNOWN/EINTERNAL/ENOENT errors,
the executor tries another peer
+ * instead of retrying the same peer. This matches RaftGroupServiceImpl
behavior.
+ *
+ * <p>The test verifies that after receiving one of these errors from a
peer, the next request goes to
+ * a DIFFERENT peer, not the same one.
+ */
+ @ParameterizedTest
+ @EnumSource(names = {"UNKNOWN", "EINTERNAL", "ENOENT"})
+ void testGetLeaderRequestTriesDifferentPeerOnTransientError(RaftError error) throws Exception {
+ Set<String> calledPeers = ConcurrentHashMap.newKeySet();
+ AtomicInteger callCount = new AtomicInteger(0);
+
+ when(messagingService.invoke(
+ any(InternalClusterNode.class),
+ any(GetLeaderRequest.class),
+ anyLong())
+ ).thenAnswer(invocation -> {
+ InternalClusterNode target = invocation.getArgument(0);
+ calledPeers.add(target.name());
+ int count = callCount.incrementAndGet();
+
+ if (count == 1) {
+ // First call returns transient error.
+ return completedFuture(FACTORY.errorResponse()
+ .errorCode(error.getNumber())
+ .build());
+ }
+
+ // Second call succeeds.
+ return completedFuture(FACTORY.getLeaderResponse()
+ .leaderId(PeerId.fromPeer(NODES.get(0)).toString())
+ .currentTerm(CURRENT_TERM)
+ .build());
+ });
+
+ PhysicalTopologyAwareRaftGroupService svc = startService();
+
+ CompletableFuture<LeaderWithTerm> result = svc.refreshAndGetLeaderWithTerm(TIMEOUT);
+
+ assertThat(result, willCompleteSuccessfully());
+
+ // Verify that at least 2 different peers were called.
+ // If the same peer was retried, calledPeers would have size 1.
+ assertThat("Should try different peers on " + error + " error, but called peers were: " + calledPeers,
+ calledPeers.size(), greaterThan(1));
}
/**
- * Tests single-attempt mode (timeout=0) with 5 nodes: all return "no
leader".
+ * Tests that RANDOM strategy with bounded timeout keeps cycling through
peers until timeout.
*
- * <p>In single-attempt mode, "no leader" is treated same as unavailable.
- * Each peer is tried exactly once, then fails with
ReplicationGroupUnavailableException.
+ * <p>When all peers return EPERM (no leader), RaftGroupServiceImpl resets
unavailable peers
+ * and keeps cycling until timeout. RaftCommandExecutor should do the same
for RANDOM strategy.
*/
@Test
- void testSingleAttemptModeWithAllNoLeader() {
+ void testRandomStrategyWithBoundedTimeoutKeepsCyclingUntilTimeout() throws Exception {
+ AtomicInteger callCount = new AtomicInteger(0);
+ CountDownLatch sixCallsReached = new CountDownLatch(6);
Review Comment:
changes
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]