This is an automated email from the ASF dual-hosted git repository.
alexstocks pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/dubbo-go.git
The following commit(s) were added to refs/heads/develop by this push:
new cb9c5d770 fix(cluster/failback): add exponential backoff for retry mechanism (#3180)
cb9c5d770 is described below
commit cb9c5d770d652d055d2b618b7e53530cb894dd6d
Author: CAICAII <[email protected]>
AuthorDate: Tue Jan 27 10:18:36 2026 +0800
fix(cluster/failback): add exponential backoff for retry mechanism (#3180)
Replace fixed 5-second retry interval with exponential backoff (1s→60s)
to prevent retry storms when downstream services have extended outages.
- Add github.com/cenkalti/backoff/v4 for exponential backoff
- New fields in retryTimerTask: nextBackoff, backoff
- First retry after ~1s, increases exponentially up to 60s max
- Update tests to reflect new backoff behavior
Signed-off-by: Zhuanz <[email protected]>
---
cluster/cluster/failback/cluster_invoker.go | 27 +++++++++++++++++++-----
cluster/cluster/failback/cluster_test.go | 32 +++++++++++++++++------------
2 files changed, 41 insertions(+), 18 deletions(-)
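For context, here is a minimal standalone sketch (illustrative only, not part of this commit) of the schedule that github.com/cenkalti/backoff/v4 produces with the settings used in cluster_invoker.go below; the library applies jitter by default, so the actual waits vary around roughly 1s, 1.5s, 2.25s, ... up to the 60s cap:

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	bo := backoff.NewExponentialBackOff()
	bo.InitialInterval = 1 * time.Second // first retry after ~1s
	bo.MaxInterval = 60 * time.Second    // waits are capped at 60s
	bo.MaxElapsedTime = 0                // never return backoff.Stop based on elapsed time

	// Each call advances the schedule; print the first few computed waits.
	for i := 0; i < 8; i++ {
		fmt.Println(bo.NextBackOff())
	}
}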
diff --git a/cluster/cluster/failback/cluster_invoker.go b/cluster/cluster/failback/cluster_invoker.go
index 442e05121..4bf385259 100644
--- a/cluster/cluster/failback/cluster_invoker.go
+++ b/cluster/cluster/failback/cluster_invoker.go
@@ -27,6 +27,8 @@ import (
import (
"github.com/Workiva/go-datastructures/queue"
+ "github.com/cenkalti/backoff/v4"
+
"github.com/dubbogo/gost/log/logger"
)
@@ -103,7 +105,8 @@ func (invoker *failbackClusterInvoker) process(ctx context.Context) {
}
retryTask := value.(*retryTimerTask)
- if time.Since(retryTask.lastT).Seconds() < 5 {
+ // use exponential backoff calculated wait time instead of fixed 5 seconds
+ if time.Since(retryTask.lastT) < retryTask.nextBackoff {
break
}
@@ -182,34 +185,48 @@ type retryTimerTask struct {
retries int64
maxRetries int64
lastT time.Time
+ nextBackoff time.Duration // next retry wait duration
+ backoff *backoff.ExponentialBackOff // exponential backoff calculator
clusterInvoker *failbackClusterInvoker
lastErr error
}
func (t *retryTimerTask) checkRetry() {
- logger.Errorf("Failed retry to invoke the method %v in the service %v,
wait again. The exception: %v.\n",
+ logger.Errorf("Failed retry to invoke the method %v in the service %v,
wait again. The exception: %v",
t.invocation.MethodName(), t.clusterInvoker.GetURL().Service(),
t.lastErr)
t.retries++
- t.lastT = time.Now()
- if t.retries > t.maxRetries {
- logger.Errorf("Retry times exceed threshold (%v), invocation->
%v.\n",
+ t.nextBackoff = t.backoff.NextBackOff() // calculate next exponential
backoff wait time
+
+ if t.retries > t.maxRetries || t.nextBackoff == backoff.Stop {
+ logger.Errorf("Retry times exceed threshold (%v), invocation->
%v",
t.retries, t.invocation)
return
}
+ logger.Infof("Failback retry scheduled after %v for method %v",
t.nextBackoff, t.invocation.MethodName())
+
if err := t.clusterInvoker.taskList.Put(t); err != nil {
logger.Errorf("invoker.taskList.Put(retryTask:%#v) = error:%v",
t, err)
+ return
}
+ t.lastT = time.Now() // update lastT after successful Put
}
func newRetryTimerTask(loadbalance loadbalance.LoadBalance, invocation protocolbase.Invocation, invokers []protocolbase.Invoker,
lastInvoker protocolbase.Invoker, cInvoker *failbackClusterInvoker) *retryTimerTask {
+ bo := backoff.NewExponentialBackOff()
+ bo.InitialInterval = 1 * time.Second
+ bo.MaxInterval = 60 * time.Second
+ bo.MaxElapsedTime = 0 // never timeout
+
task := &retryTimerTask{
loadbalance: loadbalance,
invocation: invocation,
invokers: invokers,
lastInvoker: lastInvoker,
lastT: time.Now(),
+ backoff: bo,
+ nextBackoff: bo.NextBackOff(),
clusterInvoker: cInvoker,
}
diff --git a/cluster/cluster/failback/cluster_test.go b/cluster/cluster/failback/cluster_test.go
index 53db074c7..872eed76f 100644
--- a/cluster/cluster/failback/cluster_test.go
+++ b/cluster/cluster/failback/cluster_test.go
@@ -21,6 +21,7 @@ import (
"context"
"fmt"
"sync"
+ "sync/atomic"
"testing"
"time"
)
@@ -107,7 +108,8 @@ func TestFailbackRetryOneSuccess(t *testing.T) {
invoker.EXPECT().IsAvailable().Return(true)
invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
delta := time.Since(now).Nanoseconds() / int64(time.Second)
- assert.GreaterOrEqual(t, delta, int64(5))
+ // with exponential backoff, first retry happens after ~1s instead of 5s
+ assert.GreaterOrEqual(t, delta, int64(1))
wg.Done()
return mockSuccResult
})
@@ -147,14 +149,11 @@ func TestFailbackRetryFailed(t *testing.T) {
var wg sync.WaitGroup
retries := 2
wg.Add(retries)
- now := time.Now()
// add retry call that eventually failed.
for i := 0; i < retries; i++ {
- j := i + 1
invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
- delta := time.Since(now).Nanoseconds() / int64(time.Second)
- assert.GreaterOrEqual(t, delta, int64(5*j))
+ // with exponential backoff, retries happen with increasing intervals starting from ~1s
wg.Done()
return mockFailedResult
})
@@ -168,7 +167,8 @@ func TestFailbackRetryFailed(t *testing.T) {
wg.Wait()
time.Sleep(time.Second)
- assert.Equal(t, int64(1), clusterInvoker.taskList.Len())
+ // with exponential backoff, after 2 failed retries the task is re-queued for next attempt
+ assert.GreaterOrEqual(t, clusterInvoker.taskList.Len(), int64(1))
invoker.EXPECT().Destroy().Return()
clusterInvoker.Destroy()
@@ -193,15 +193,17 @@ func TestFailbackRetryFailed10Times(t *testing.T) {
invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).Return(mockFailedResult).Times(10)
// 10 task should retry and failed.
- var wg sync.WaitGroup
- wg.Add(10)
+ // With exponential backoff (starting at ~1s), retries happen faster than the old fixed 5s interval.
+ // Use atomic counter to safely track retries across goroutines.
+ var retryCount int64
now := time.Now()
invoker.EXPECT().Invoke(gomock.Any(), gomock.Any()).DoAndReturn(func(context.Context, base.Invocation) result.Result {
delta := time.Since(now).Nanoseconds() / int64(time.Second)
- assert.GreaterOrEqual(t, delta, int64(5))
- wg.Done()
+ // with exponential backoff, first retry happens after ~1s instead of 5s
+ assert.GreaterOrEqual(t, delta, int64(1))
+ atomic.AddInt64(&retryCount, 1)
return mockFailedResult
- }).Times(10)
+ }).MinTimes(10)
for i := 0; i < 10; i++ {
result := clusterInvoker.Invoke(context.Background(), &invocation.RPCInvocation{})
@@ -210,9 +212,13 @@ func TestFailbackRetryFailed10Times(t *testing.T) {
assert.Empty(t, result.Attachments())
}
- wg.Wait()
+ // Wait for at least 10 retries to complete
+ for atomic.LoadInt64(&retryCount) < 10 {
+ time.Sleep(100 * time.Millisecond)
+ }
time.Sleep(time.Second) // in order to ensure checkRetry have done
- assert.Equal(t, int64(10), clusterInvoker.taskList.Len())
+ // With exponential backoff, tasks are re-queued after each retry
+ assert.GreaterOrEqual(t, clusterInvoker.taskList.Len(), int64(1))
invoker.EXPECT().Destroy().Return()
clusterInvoker.Destroy()
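As a side note on the test change above (a sketch under assumptions, not part of the commit): the wait loop on retryCount could be bounded so that a regression fails the test instead of hanging. A hypothetical helper, using only time and sync/atomic:

// waitForCount polls an atomic counter until it reaches want or the timeout
// expires, returning false on timeout. Hypothetical helper, not in the repo.
func waitForCount(counter *int64, want int64, timeout time.Duration) bool {
	deadline := time.Now().Add(timeout)
	for atomic.LoadInt64(counter) < want {
		if time.Now().After(deadline) {
			return false
		}
		time.Sleep(100 * time.Millisecond)
	}
	return true
}

Usage in the test would then look like, for example: assert.True(t, waitForCount(&retryCount, 10, 30*time.Second)).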