This is an automated email from the ASF dual-hosted git repository.

alexstocks pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/dubbo-go.git


The following commit(s) were added to refs/heads/develop by this push:
     new c2b0e2f4c fix(cluster/failback): stabilize flaky tests for exponential backoff (#3187)
c2b0e2f4c is described below

commit c2b0e2f4c9372b6666ac353efff0e4e597a69a5d
Author: CAICAII <[email protected]>
AuthorDate: Wed Feb 4 11:35:47 2026 +0800

    fix(cluster/failback): stabilize flaky tests for exponential backoff (#3187)
    
    * fix(cluster/failback): stabilize flaky tests for exponential backoff
    
    Replace WaitGroup-based synchronization with atomic counter and polling
    to handle non-deterministic timing from exponential backoff with jitter.
    
    - Use atomic.Int64 counter instead of WaitGroup for retry tracking
    - Use MinTimes() instead of fixed Times() for gomock expectations
    - Remove time-based assertions that fail due to randomization factor
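    
    A minimal sketch of the resulting pattern, mirroring the diff below
    (assumes the test file's existing gomock and sync/atomic imports;
    invoker and mockFailedResult come from the test's setup):
    
        var retryCount int64
        targetRetries := int64(2)
        // Count every retry attempt; the backoff schedule is jittered,
        // so only a lower bound on the call count is asserted.
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).
                DoAndReturn(func(context.Context, base.Invocation) result.Result {
                        atomic.AddInt64(&retryCount, 1)
                        return mockFailedResult
                }).MinTimes(int(targetRetries))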
    
    Signed-off-by: Zhuanz <[email protected]>
    
    * test(cluster/failback): replace unbounded polling with require.Eventually
    
    Replace unbounded for-loops and time.Sleep with require.Eventually to add
    timeout protection for flaky test synchronization.
    
    - Add bounded timeout (10s/30s) to prevent hanging tests
    - Replace fixed time.Sleep with condition-based waiting
    - Split retry and taskList waits for clearer failure diagnosis
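    
    As a sketch, the bounded wait uses testify's require.Eventually,
    which polls the condition at the tick interval and fails the test
    once the timeout elapses instead of hanging forever:
    
        // Poll every 100ms; give up (and fail the test) after 10s.
        require.Eventually(t, func() bool {
                return atomic.LoadInt64(&retryCount) >= targetRetries
        }, 10*time.Second, 100*time.Millisecond)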
    
    Signed-off-by: Zhuanz <[email protected]>
    
    * test(cluster/failback): merge with develop - resolve conflict
    
    Merge develop branch into fix-failback-flaky-test-clean.
    Resolved conflicts in TestFailbackRetryFailed and TestFailbackRetryFailed10Times.
    
    Changes from this branch (applying Copilot review suggestions):
    - Replace unbounded polling with require.Eventually (10s/30s timeout)
    - Remove time.Sleep in favor of condition-based waiting
    - Split retry and taskList waits for clearer failure diagnosis
    
    Signed-off-by: Zhuanz <[email protected]>
    
    ---------
    
    Signed-off-by: Zhuanz <[email protected]>
    Signed-off-by: CAICAIIs
---
 cluster/cluster/failback/cluster_test.go | 52 ++++++++++++++++----------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/cluster/cluster/failback/cluster_test.go b/cluster/cluster/failback/cluster_test.go
index 60aa08cd2..89f9ae58e 100644
--- a/cluster/cluster/failback/cluster_test.go
+++ b/cluster/cluster/failback/cluster_test.go
@@ -131,7 +131,7 @@ func TestFailbackRetryOneSuccess(t *testing.T) {
        assert.Equal(t, int64(0), clusterInvoker.taskList.Len())
 }
 
-// failed firstly, and failed again after ech retry time
+// failed firstly, and failed again after each retry time.
 func TestFailbackRetryFailed(t *testing.T) {
        ctrl := gomock.NewController(t)
        defer ctrl.Finish()
@@ -145,19 +145,16 @@ func TestFailbackRetryFailed(t *testing.T) {
        mockFailedResult := &result.RPCResult{Err: perrors.New("error")}
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).Return(mockFailedResult)
 
-       var wg sync.WaitGroup
-       var retryCount atomic.Int64
-       retries := 2
-       wg.Add(retries)
+       // Use atomic counter to safely track retries across goroutines.
+       // With exponential backoff and randomization factor, timing is non-deterministic.
+       var retryCount int64
+       targetRetries := int64(2)
 
-       // add retry call that eventually failed.
+       // add retry calls that eventually failed.
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
-               // with exponential backoff, retries happen with increasing intervals starting from ~1s
-               if retryCount.Add(1) <= int64(retries) {
-                       wg.Done()
-               }
+               atomic.AddInt64(&retryCount, 1)
                return mockFailedResult
-       }).MinTimes(retries)
+       }).MinTimes(int(targetRetries))
 
        // first call should failed.
        result := clusterInvoker.Invoke(context.Background(), &invocation.RPCInvocation{})
@@ -165,10 +162,15 @@ func TestFailbackRetryFailed(t *testing.T) {
        assert.Nil(t, result.Result())
        assert.Empty(t, result.Attachments())
 
-       wg.Wait()
-       time.Sleep(time.Second)
-       // with exponential backoff, after 2 failed retries the task is re-queued for next attempt
-       assert.GreaterOrEqual(t, clusterInvoker.taskList.Len(), int64(1))
+       // Wait for at least targetRetries to complete, with bounded timeout to avoid hanging tests
+       require.Eventually(t, func() bool {
+               return atomic.LoadInt64(&retryCount) >= targetRetries
+       }, 10*time.Second, 100*time.Millisecond)
+
+       // Wait for task to be re-queued after retries (with timeout)
+       require.Eventually(t, func() bool {
+               return clusterInvoker.taskList.Len() >= int64(1)
+       }, 5*time.Second, 100*time.Millisecond)
 
        invoker.EXPECT().Destroy().Return()
        clusterInvoker.Destroy()
@@ -197,11 +199,7 @@ func TestFailbackRetryFailed10Times(t *testing.T) {
        // With exponential backoff (starting at ~1s), retries happen faster than the old fixed 5s interval.
        // Use atomic counter to safely track retries across goroutines.
        var retryCount int64
-       now := time.Now()
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
-               delta := time.Since(now).Nanoseconds() / int64(time.Second)
-               // with exponential backoff, first retry happens after ~1s instead of 5s
-               assert.GreaterOrEqual(t, delta, int64(1))
                atomic.AddInt64(&retryCount, 1)
                return mockFailedResult
        }).MinTimes(10)
@@ -213,13 +211,15 @@ func TestFailbackRetryFailed10Times(t *testing.T) {
                assert.Empty(t, result.Attachments())
        }
 
-       // Wait for at least 10 retries to complete
-       for atomic.LoadInt64(&retryCount) < 10 {
-               time.Sleep(100 * time.Millisecond)
-       }
-       time.Sleep(time.Second) // in order to ensure checkRetry have done
-       // With exponential backoff, tasks are re-queued after each retry
-       assert.GreaterOrEqual(t, clusterInvoker.taskList.Len(), int64(1))
+       // Wait for at least 10 retries to complete, with bounded timeout
+       require.Eventually(t, func() bool {
+               return atomic.LoadInt64(&retryCount) >= 10
+       }, 30*time.Second, 100*time.Millisecond)
+
+       // Wait for tasks to be re-queued after retries
+       require.Eventually(t, func() bool {
+               return clusterInvoker.taskList.Len() >= int64(1)
+       }, 5*time.Second, 100*time.Millisecond)
 
        invoker.EXPECT().Destroy().Return()
        clusterInvoker.Destroy()
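
A quick way to exercise the stabilized tests locally (hypothetical
invocation, assuming a standard Go toolchain; -count repeats the run
to surface any residual flakiness):

    go test -race -count=20 -run TestFailbackRetry ./cluster/cluster/failback/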
