This is an automated email from the ASF dual-hosted git repository.

alexstocks pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/dubbo-go.git


The following commit(s) were added to refs/heads/develop by this push:
     new cb9c5d770 fix(cluster/failback): add exponential backoff for retry mechanism (#3180)
cb9c5d770 is described below

commit cb9c5d770d652d055d2b618b7e53530cb894dd6d
Author: CAICAII <[email protected]>
AuthorDate: Tue Jan 27 10:18:36 2026 +0800

    fix(cluster/failback): add exponential backoff for retry mechanism (#3180)
    
    Replace fixed 5-second retry interval with exponential backoff (1s→60s)
    to prevent retry storms when downstream services have extended outages.
    
    - Add github.com/cenkalti/backoff/v4 for exponential backoff
    - New fields in retryTimerTask: nextBackoff, backoff
    - First retry after ~1s, increases exponentially up to 60s max
    - Update tests to reflect new backoff behavior
    
    Signed-off-by: Zhuanz <[email protected]>
---
 cluster/cluster/failback/cluster_invoker.go | 27 +++++++++++++++++++-----
 cluster/cluster/failback/cluster_test.go    | 32 +++++++++++++++++------------
 2 files changed, 41 insertions(+), 18 deletions(-)
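
For quick reference, the schedule introduced by this change can be previewed with a minimal standalone program. The InitialInterval / MaxInterval / MaxElapsedTime values below mirror the patch; the package main wrapper and the print loop are purely illustrative and are not part of the commit.

package main

import (
        "fmt"
        "time"

        "github.com/cenkalti/backoff/v4"
)

func main() {
        // Same knobs as the patch: first wait around 1s, capped at 60s; a zero
        // MaxElapsedTime disables the elapsed-time cutoff, so NextBackOff never
        // returns backoff.Stop on its own and maxRetries stays the only limit.
        bo := backoff.NewExponentialBackOff()
        bo.InitialInterval = 1 * time.Second
        bo.MaxInterval = 60 * time.Second
        bo.MaxElapsedTime = 0

        // With the library defaults (Multiplier 1.5, RandomizationFactor 0.5),
        // the waits grow roughly 1s, 1.5s, 2.25s, ... with jitter, up to the 60s cap.
        for i := 0; i < 8; i++ {
                fmt.Println(bo.NextBackOff())
        }
}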

diff --git a/cluster/cluster/failback/cluster_invoker.go b/cluster/cluster/failback/cluster_invoker.go
index 442e05121..4bf385259 100644
--- a/cluster/cluster/failback/cluster_invoker.go
+++ b/cluster/cluster/failback/cluster_invoker.go
@@ -27,6 +27,8 @@ import (
 import (
        "github.com/Workiva/go-datastructures/queue"
 
+       "github.com/cenkalti/backoff/v4"
+
        "github.com/dubbogo/gost/log/logger"
 )
 
@@ -103,7 +105,8 @@ func (invoker *failbackClusterInvoker) process(ctx context.Context) {
                        }
 
                        retryTask := value.(*retryTimerTask)
-                       if time.Since(retryTask.lastT).Seconds() < 5 {
+                       // use exponential backoff calculated wait time instead of fixed 5 seconds
+                       if time.Since(retryTask.lastT) < retryTask.nextBackoff {
                                break
                        }
 
@@ -182,34 +185,48 @@ type retryTimerTask struct {
        retries        int64
        maxRetries     int64
        lastT          time.Time
+       nextBackoff    time.Duration               // next retry wait duration
+       backoff        *backoff.ExponentialBackOff // exponential backoff calculator
        clusterInvoker *failbackClusterInvoker
        lastErr        error
 }
 
 func (t *retryTimerTask) checkRetry() {
-       logger.Errorf("Failed retry to invoke the method %v in the service %v, wait again. The exception: %v.\n",
+       logger.Errorf("Failed retry to invoke the method %v in the service %v, wait again. The exception: %v",
                t.invocation.MethodName(), t.clusterInvoker.GetURL().Service(), t.lastErr)
        t.retries++
-       t.lastT = time.Now()
-       if t.retries > t.maxRetries {
-               logger.Errorf("Retry times exceed threshold (%v), invocation-> %v.\n",
+       t.nextBackoff = t.backoff.NextBackOff() // calculate next exponential backoff wait time
+
+       if t.retries > t.maxRetries || t.nextBackoff == backoff.Stop {
+               logger.Errorf("Retry times exceed threshold (%v), invocation-> %v",
                        t.retries, t.invocation)
                return
        }
 
+       logger.Infof("Failback retry scheduled after %v for method %v", t.nextBackoff, t.invocation.MethodName())
+
        if err := t.clusterInvoker.taskList.Put(t); err != nil {
                logger.Errorf("invoker.taskList.Put(retryTask:%#v) = error:%v", t, err)
+               return
        }
+       t.lastT = time.Now() // update lastT after successful Put
 }
 
 func newRetryTimerTask(loadbalance loadbalance.LoadBalance, invocation protocolbase.Invocation, invokers []protocolbase.Invoker,
        lastInvoker protocolbase.Invoker, cInvoker *failbackClusterInvoker) *retryTimerTask {
+       bo := backoff.NewExponentialBackOff()
+       bo.InitialInterval = 1 * time.Second
+       bo.MaxInterval = 60 * time.Second
+       bo.MaxElapsedTime = 0 // never timeout
+
        task := &retryTimerTask{
                loadbalance:    loadbalance,
                invocation:     invocation,
                invokers:       invokers,
                lastInvoker:    lastInvoker,
                lastT:          time.Now(),
+               backoff:        bo,
+               nextBackoff:    bo.NextBackOff(),
                clusterInvoker: cInvoker,
        }
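
Taken together, process() gates each task on its own backoff window while checkRetry() widens that window after every failure. Below is a simplified, self-contained sketch of that interplay; the retrySketch type and the demo loop are hypothetical stand-ins, not dubbo-go code.

package main

import (
        "fmt"
        "time"

        "github.com/cenkalti/backoff/v4"
)

// retrySketch is a hypothetical, stripped-down stand-in for retryTimerTask.
type retrySketch struct {
        lastT       time.Time
        nextBackoff time.Duration
        bo          *backoff.ExponentialBackOff
}

// ready mirrors the gate in process(): do nothing until the wait has elapsed.
func (r *retrySketch) ready() bool {
        return time.Since(r.lastT) >= r.nextBackoff
}

// reschedule mirrors checkRetry(): compute the next wait and reset the clock;
// returning false corresponds to the backoff.Stop give-up case.
func (r *retrySketch) reschedule() bool {
        r.nextBackoff = r.bo.NextBackOff()
        if r.nextBackoff == backoff.Stop {
                return false
        }
        r.lastT = time.Now()
        return true
}

func main() {
        bo := backoff.NewExponentialBackOff()
        bo.InitialInterval = 100 * time.Millisecond // shortened so the demo finishes quickly
        bo.MaxInterval = time.Second
        bo.MaxElapsedTime = 0

        r := &retrySketch{lastT: time.Now(), nextBackoff: bo.NextBackOff(), bo: bo}
        for attempt := 1; attempt <= 3; attempt++ {
                for !r.ready() {
                        time.Sleep(10 * time.Millisecond)
                }
                fmt.Printf("attempt %d fired after waiting %v\n", attempt, r.nextBackoff)
                if !r.reschedule() {
                        break
                }
        }
}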
 
diff --git a/cluster/cluster/failback/cluster_test.go b/cluster/cluster/failback/cluster_test.go
index 53db074c7..872eed76f 100644
--- a/cluster/cluster/failback/cluster_test.go
+++ b/cluster/cluster/failback/cluster_test.go
@@ -21,6 +21,7 @@ import (
        "context"
        "fmt"
        "sync"
+       "sync/atomic"
        "testing"
        "time"
 )
@@ -107,7 +108,8 @@ func TestFailbackRetryOneSuccess(t *testing.T) {
        invoker.EXPECT().IsAvailable().Return(true)
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
                delta := time.Since(now).Nanoseconds() / int64(time.Second)
-               assert.GreaterOrEqual(t, delta, int64(5))
+               // with exponential backoff, first retry happens after ~1s instead of 5s
+               assert.GreaterOrEqual(t, delta, int64(1))
                wg.Done()
                return mockSuccResult
        })
@@ -147,14 +149,11 @@ func TestFailbackRetryFailed(t *testing.T) {
        var wg sync.WaitGroup
        retries := 2
        wg.Add(retries)
-       now := time.Now()
 
        // add retry call that eventually failed.
        for i := 0; i < retries; i++ {
-               j := i + 1
                invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
-                       delta := time.Since(now).Nanoseconds() / int64(time.Second)
-                       assert.GreaterOrEqual(t, delta, int64(5*j))
+                       // with exponential backoff, retries happen with increasing intervals starting from ~1s
                        wg.Done()
                        return mockFailedResult
                })
@@ -168,7 +167,8 @@ func TestFailbackRetryFailed(t *testing.T) {
 
        wg.Wait()
        time.Sleep(time.Second)
-       assert.Equal(t, int64(1), clusterInvoker.taskList.Len())
+       // with exponential backoff, after 2 failed retries the task is re-queued for next attempt
+       assert.GreaterOrEqual(t, clusterInvoker.taskList.Len(), int64(1))
 
        invoker.EXPECT().Destroy().Return()
        clusterInvoker.Destroy()
@@ -193,15 +193,17 @@ func TestFailbackRetryFailed10Times(t *testing.T) {
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).Return(mockFailedResult).Times(10)
 
        // 10 task should retry and failed.
-       var wg sync.WaitGroup
-       wg.Add(10)
+       // With exponential backoff (starting at ~1s), retries happen faster than the old fixed 5s interval.
+       // Use atomic counter to safely track retries across goroutines.
+       var retryCount int64
        now := time.Now()
        invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
                delta := time.Since(now).Nanoseconds() / int64(time.Second)
-               assert.GreaterOrEqual(t, delta, int64(5))
-               wg.Done()
+               // with exponential backoff, first retry happens after ~1s instead of 5s
+               assert.GreaterOrEqual(t, delta, int64(1))
+               atomic.AddInt64(&retryCount, 1)
                return mockFailedResult
-       }).Times(10)
+       }).MinTimes(10)
 
        for i := 0; i < 10; i++ {
                result := clusterInvoker.Invoke(context.Background(), &invocation.RPCInvocation{})
@@ -210,9 +212,13 @@ func TestFailbackRetryFailed10Times(t *testing.T) {
                assert.Empty(t, result.Attachments())
        }
 
-       wg.Wait()
+       // Wait for at least 10 retries to complete
+       for atomic.LoadInt64(&retryCount) < 10 {
+               time.Sleep(100 * time.Millisecond)
+       }
        time.Sleep(time.Second) // in order to ensure checkRetry have done
-       assert.Equal(t, int64(10), clusterInvoker.taskList.Len())
+       // With exponential backoff, tasks are re-queued after each retry
+       assert.GreaterOrEqual(t, clusterInvoker.taskList.Len(), int64(1))
 
        invoker.EXPECT().Destroy().Return()
        clusterInvoker.Destroy()
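
One caution on the reworked test above: the bare polling loop on retryCount has no deadline, so a stalled retry would hang the suite rather than fail it. A bounded helper along these lines could be used instead; this is hypothetical and not part of the patch, and the package name and timeout are assumptions.

package failback

import (
        "sync/atomic"
        "testing"
        "time"
)

// waitForCount is a hypothetical test helper: poll an atomic counter until it
// reaches want, or fail the test once the deadline passes.
func waitForCount(t *testing.T, counter *int64, want int64, timeout time.Duration) {
        t.Helper()
        deadline := time.Now().Add(timeout)
        for atomic.LoadInt64(counter) < want {
                if time.Now().After(deadline) {
                        t.Fatalf("saw %d retries after %v, want at least %d",
                                atomic.LoadInt64(counter), timeout, want)
                }
                time.Sleep(100 * time.Millisecond)
        }
}

The test could then wait with, for example, waitForCount(t, &retryCount, 10, time.Minute) instead of the open-ended loop.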
